LCOV - code coverage report
Current view: top level - src/backend/storage/aio - aio.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18beta1 Lines: 282 333 84.7 %
Date: 2025-06-07 19:17:40 Functions: 35 37 94.6 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * aio.c
       4             :  *    AIO - Core Logic
       5             :  *
       6             :  * For documentation about how AIO works on a higher level, including a
       7             :  * schematic example, see README.md.
       8             :  *
       9             :  *
      10             :  * AIO is a complicated subsystem. To keep things navigable, it is split
      11             :  * across a number of files:
      12             :  *
      13             :  * - method_*.c - different ways of executing AIO (e.g. worker process)
      14             :  *
      15             :  * - aio_target.c - IO on different kinds of targets
      16             :  *
      17             :  * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
      18             :  *
      19             :  * - aio_callback.c - callbacks at IO operation lifecycle events
      20             :  *
      21             :  * - aio_init.c - per-server and per-backend initialization
      22             :  *
      23             :  * - aio.c - all other topics
      24             :  *
      25             :  * - read_stream.c - helper for reading buffered relation data
      26             :  *
      27             :  * - README.md - higher-level overview of AIO
      28             :  *
      29             :  *
      30             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      31             :  * Portions Copyright (c) 1994, Regents of the University of California
      32             :  *
      33             :  * IDENTIFICATION
      34             :  *    src/backend/storage/aio/aio.c
      35             :  *
      36             :  *-------------------------------------------------------------------------
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include "lib/ilist.h"
      42             : #include "miscadmin.h"
      43             : #include "port/atomics.h"
      44             : #include "storage/aio.h"
      45             : #include "storage/aio_internal.h"
      46             : #include "storage/aio_subsys.h"
      47             : #include "utils/guc.h"
      48             : #include "utils/guc_hooks.h"
      49             : #include "utils/injection_point.h"
      50             : #include "utils/resowner.h"
      51             : #include "utils/wait_event_types.h"
      52             : 
      53             : 
      54             : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
      55             : static void pgaio_io_reclaim(PgAioHandle *ioh);
      56             : static void pgaio_io_resowner_register(PgAioHandle *ioh);
      57             : static void pgaio_io_wait_for_free(void);
      58             : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
      59             : static const char *pgaio_io_state_get_name(PgAioHandleState s);
      60             : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
      61             : 
      62             : 
      63             : /*
                     :  * Options for io_method.
                     :  *
                     :  * Each entry maps a setting name to its IOMETHOD_* value; "io_uring" is
                     :  * only listed when IOMETHOD_IO_URING_ENABLED is defined. The trailing
                     :  * {NULL, 0, false} entry presumably terminates the list - confirm against
                     :  * the code consuming config_enum_entry arrays.
                     :  */
      64             : const struct config_enum_entry io_method_options[] = {
      65             :     {"sync", IOMETHOD_SYNC, false},
      66             :     {"worker", IOMETHOD_WORKER, false},
      67             : #ifdef IOMETHOD_IO_URING_ENABLED
      68             :     {"io_uring", IOMETHOD_IO_URING, false},
      69             : #endif
      70             :     {NULL, 0, false}
      71             : };
      72             : 
      73             : /*
                     :  * GUCs
                     :  *
                     :  * io_method starts out as DEFAULT_IO_METHOD. NOTE(review): the initial
                     :  * io_max_concurrency of -1 looks like a "not yet determined" marker;
                     :  * confirm against the code that consumes it.
                     :  */
      74             : int         io_method = DEFAULT_IO_METHOD;
      75             : int         io_max_concurrency = -1;
      76             : 
      77             : /* global control for AIO */
      78             : PgAioCtl   *pgaio_ctl;
      79             : 
      80             : /* current backend's per-backend state */
      81             : PgAioBackend *pgaio_my_backend;
      82             : 
      83             : 
      84             : static const IoMethodOps *const pgaio_method_ops_table[] = {
      85             :     [IOMETHOD_SYNC] = &pgaio_sync_ops,
      86             :     [IOMETHOD_WORKER] = &pgaio_worker_ops,
      87             : #ifdef IOMETHOD_IO_URING_ENABLED
      88             :     [IOMETHOD_IO_URING] = &pgaio_uring_ops,
      89             : #endif
      90             : };
      91             : 
      92             : /*
                     :  * callbacks for the configured io_method, set by assign_io_method
                     :  *
                     :  * pgaio_method_ops_table above holds one IoMethodOps pointer per
                     :  * IOMETHOD_* value (the io_uring slot exists only when
                     :  * IOMETHOD_IO_URING_ENABLED is defined). NOTE(review): assign_io_method
                     :  * is not part of this section; presumably it points pgaio_method_ops at
                     :  * the table entry matching io_method - confirm there.
                     :  */
      93             : const IoMethodOps *pgaio_method_ops;
      94             : 
      95             : 
      96             : /* --------------------------------------------------------------------------------
      97             :  * Public Functions related to PgAioHandle
      98             :  * --------------------------------------------------------------------------------
      99             :  */
     100             : 
     101             : /*
     102             :  * Acquire an AioHandle, waiting for IO completion if necessary.
     103             :  *
     104             :  * Each backend can only have one AIO handle that has been "handed out" to
     105             :  * code, but not yet submitted or released. This restriction is necessary to
     106             :  * ensure that it is possible for code to wait for an unused handle by waiting
     107             :  * for in-flight IO to complete. There is a limited number of handles in each
     108             :  * backend, if multiple handles could be handed out without being submitted,
     109             :  * waiting for all in-flight IO to complete would not guarantee that handles
     110             :  * free up.
     111             :  *
     112             :  * It is cheap to acquire an IO handle, unless all handles are in use. In that
     113             :  * case this function waits for the oldest IO to complete. If that is not
     114             :  * desirable, use pgaio_io_acquire_nb().
     115             :  *
     116             :  * If a handle was acquired but then does not turn out to be needed,
     117             :  * e.g. because pgaio_io_acquire() is called before starting an IO in a
     118             :  * critical section, the handle needs to be released with pgaio_io_release().
     119             :  *
     120             :  *
     121             :  * To react to the completion of the IO as soon as it is known to have
     122             :  * completed, callbacks can be registered with pgaio_io_register_callbacks().
     123             :  *
     124             :  * To actually execute IO using the returned handle, the pgaio_io_start_*()
     125             :  * family of functions is used. In many cases the pgaio_io_start_*() call will
     126             :  * not be done directly by code that acquired the handle, but by lower level
     127             :  * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
     128             :  * AIO, it typically will pass the handle to smgr.c, which will pass it on to
     129             :  * md.c, on to fd.c, which then finally calls pgaio_io_start_*().  This
     130             :  * forwarding allows the various layers to react to the IO's completion by
     131             :  * registering callbacks. These callbacks in turn can translate a lower
     132             :  * layer's result into a result understandable by a higher layer.
     133             :  *
     134             :  * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
     135             :  * not submitted to the kernel). Unless in batchmode
     136             :  * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
     137             :  * execution. Note that, whether in batchmode or not, the IO might even
     138             :  * complete before the functions return.
     139             :  *
     140             :  * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
     141             :  * referenced by the IO issuing code. To e.g. wait for IO, references to the
     142             :  * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
     143             :  * is called.  pgaio_wref_wait() can be used to wait for the IO to complete.
     144             :  *
     145             :  *
     146             :  * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
     147             :  * passed to pgaio_io_acquire(). Once the issuing backend has called
     148             :  * pgaio_wref_wait(), the PgAioReturn contains information about whether the
     149             :  * operation succeeded and details about the first failure, if any. The error
     150             :  * can be raised / logged with pgaio_result_report().
     151             :  *
     152             :  * The lifetime of the memory pointed to by *ret needs to be at least as long
     153             :  * as the passed in resowner. If the resowner releases resources before the IO
     154             :  * completes (typically due to an error), the reference to *ret will be
     155             :  * cleared. In case of resowner cleanup *ret will not be updated with the
     156             :  * results of the IO operation.
     157             :  */
     158             : PgAioHandle *
     159        5868 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     160             : {
     161             :     PgAioHandle *h;
     162             : 
     163             :     while (true)
     164             :     {
     165       11536 :         h = pgaio_io_acquire_nb(resowner, ret);
     166             : 
     167       11532 :         if (h != NULL)
     168        5864 :             return h;
     169             : 
     170             :         /*
     171             :          * Evidently all handles by this backend are in use. Just wait for
     172             :          * some to complete.
                     :          *
                     :          * pgaio_io_acquire_nb() returned NULL, so it is retried once
                     :          * pgaio_io_wait_for_free() comes back; this function only returns
                     :          * after a handle has actually been obtained.
     173             :          */
     174        5668 :         pgaio_io_wait_for_free();
     175             :     }
     176             : }
     177             : 
     178             : /*
     179             :  * Acquire an AioHandle, returning NULL if no handles are free.
     180             :  *
     181             :  * See pgaio_io_acquire(). The only difference is that this function will return
     182             :  * NULL if there are no idle handles, instead of blocking.
                     :  *
                     :  * Steps performed here:
                     :  * - if PGAIO_SUBMIT_BATCH_SIZE IOs are already staged, they are submitted
                     :  *   first via pgaio_submit_staged()
                     :  * - an ERROR is raised if this backend still has a handed-out handle
                     :  * - otherwise the head of the backend's idle_ios list (if any) is moved to
                     :  *   PGAIO_HS_HANDED_OUT and remembered as handed_out_io
                     :  *
                     :  * If resowner is non-NULL the handle is registered with a resource owner.
                     :  * Note that pgaio_io_resowner_register() uses CurrentResourceOwner; the
                     :  * resowner argument itself only decides whether registration happens.
                     :  *
                     :  * If ret is non-NULL it is stored in the handle's report_return and
                     :  * ret->result.status is reset to PGAIO_RS_UNKNOWN.
     183             :  */
     184             : PgAioHandle *
     185     2412168 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     186             : {
     187     2412168 :     PgAioHandle *ioh = NULL;
     188             : 
     189     2412168 :     if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
     190             :     {
     191             :         Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
     192           0 :         pgaio_submit_staged();
     193             :     }
     194             : 
     195     2412168 :     if (pgaio_my_backend->handed_out_io)
     196           4 :         elog(ERROR, "API violation: Only one IO can be handed out");
     197             : 
     198             :     /*
     199             :      * Probably not needed today, as interrupts should not process this IO,
     200             :      * but...
     201             :      */
     202     2412164 :     HOLD_INTERRUPTS();
     203             : 
     204     2412164 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     205             :     {
     206     2400828 :         dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
     207             : 
     208     2400828 :         ioh = dclist_container(PgAioHandle, node, ion);
     209             : 
     210             :         Assert(ioh->state == PGAIO_HS_IDLE);
     211             :         Assert(ioh->owner_procno == MyProcNumber);
     212             : 
     213     2400828 :         pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
     214     2400828 :         pgaio_my_backend->handed_out_io = ioh;
     215             : 
     216     2400828 :         if (resowner)
     217     2400828 :             pgaio_io_resowner_register(ioh);
     218             : 
     219     2400828 :         if (ret)
     220             :         {
     221     2400776 :             ioh->report_return = ret;
     222     2400776 :             ret->result.status = PGAIO_RS_UNKNOWN;
     223             :         }
     224             :     }
     225             : 
     226     2412164 :     RESUME_INTERRUPTS();
     227             : 
     228     2412164 :     return ioh;
     229             : }
     230             : 
     231             : /*
     232             :  * Release IO handle that turned out to not be required.
     233             :  *
     234             :  * See pgaio_io_acquire() for more details.
                     :  *
                     :  * Only the handle that is currently handed out to this backend can be
                     :  * released this way; it is reclaimed via pgaio_io_reclaim(). Passing any
                     :  * other handle raises an ERROR.
     235             :  */
     236             : void
     237        3996 : pgaio_io_release(PgAioHandle *ioh)
     238             : {
     239        3996 :     if (ioh == pgaio_my_backend->handed_out_io)
     240             :     {
     241             :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     242             :         Assert(ioh->resowner);
     243             : 
     244        3992 :         pgaio_my_backend->handed_out_io = NULL;
     245             : 
     246             :         /*
     247             :          * Note that no interrupts are processed between the handed_out_io
     248             :          * check and the call to reclaim - that's important as otherwise an
     249             :          * interrupt could have already reclaimed the handle.
     250             :          */
     251        3992 :         pgaio_io_reclaim(ioh);
     252             :     }
     253             :     else
     254             :     {
     255           4 :         elog(ERROR, "release in unexpected state");
     256             :     }
     257        3992 : }
     258             : 
     259             : /*
     260             :  * Release IO handle during resource owner cleanup.
                     :  *
                     :  * ioh_node is the resowner_node embedded in the handle. The handle is
                     :  * always detached from its resource owner. What happens next depends on
                     :  * the handle's state:
                     :  * - handed out: the handle is reclaimed; if it is this backend's
                     :  *   handed_out_io and on_error is false, a "leaked AIO handle" WARNING is
                     :  *   emitted
                     :  * - defined / staged: staged IOs are submitted, with a WARNING unless
                     :  *   on_error is true
                     :  * - submitted or any completed state: nothing further is done
                     :  * - idle: raises an ERROR, an idle handle is not expected here
                     :  *
                     :  * Finally report_return is cleared so the IO's result is no longer
                     :  * written to caller-provided memory.
     261             :  */
     262             : void
     263          94 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
     264             : {
     265          94 :     PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
     266             : 
     267             :     Assert(ioh->resowner);
     268             : 
     269             :     /*
     270             :      * Otherwise an interrupt, in the middle of releasing the IO, could end up
     271             :      * trying to wait for the IO, leading to state confusion.
     272             :      */
     273          94 :     HOLD_INTERRUPTS();
     274             : 
     275          94 :     ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     276          94 :     ioh->resowner = NULL;
     277             : 
     278          94 :     switch (ioh->state)
     279             :     {
     280           0 :         case PGAIO_HS_IDLE:
     281           0 :             elog(ERROR, "unexpected");
     282             :             break;
     283          66 :         case PGAIO_HS_HANDED_OUT:
     284             :             Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
     285             : 
     286          66 :             if (ioh == pgaio_my_backend->handed_out_io)
     287             :             {
     288          66 :                 pgaio_my_backend->handed_out_io = NULL;
     289          66 :                 if (!on_error)
     290          20 :                     elog(WARNING, "leaked AIO handle");
     291             :             }
     292             : 
     293          66 :             pgaio_io_reclaim(ioh);
     294          66 :             break;
     295           0 :         case PGAIO_HS_DEFINED:
     296             :         case PGAIO_HS_STAGED:
     297           0 :             if (!on_error)
     298           0 :                 elog(WARNING, "AIO handle was not submitted");
     299           0 :             pgaio_submit_staged();
     300           0 :             break;
     301          28 :         case PGAIO_HS_SUBMITTED:
     302             :         case PGAIO_HS_COMPLETED_IO:
     303             :         case PGAIO_HS_COMPLETED_SHARED:
     304             :         case PGAIO_HS_COMPLETED_LOCAL:
     305             :             /* this is expected to happen */
     306          28 :             break;
     307             :     }
     308             : 
     309             :     /*
     310             :      * Need to unregister the reporting of the IO's result, the memory it's
     311             :      * referencing likely has gone away.
     312             :      */
     313          94 :     if (ioh->report_return)
     314          28 :         ioh->report_return = NULL;
     315             : 
     316          94 :     RESUME_INTERRUPTS();
     317          94 : }
     318             : 
     319             : /*
     320             :  * Add a [set of] flags to the IO.
     321             :  *
     322             :  * Note that this combines flags with already set flags, rather than set flags
     323             :  * to explicitly the passed in parameters. This is to allow multiple callsites
     324             :  * to set flags.
                     :  *
                     :  * The handle is expected to still be in the PGAIO_HS_HANDED_OUT state,
                     :  * i.e. flags have to be set before the IO is staged.
     325             :  */
     326             : void
     327     4790672 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
     328             : {
     329             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     330             : 
     331     4790672 :     ioh->flags |= flag;
     332     4790672 : }
     333             : 
     334             : /*
     335             :  * Returns an ID uniquely identifying the IO handle. This is only really
     336             :  * useful for logging, as handles are reused across multiple IOs.
                     :  *
                     :  * The ID is the handle's index within the pgaio_ctl->io_handles array; the
                     :  * assertion checks that ioh really points into that array.
     337             :  */
     338             : int
     339     1111286 : pgaio_io_get_id(PgAioHandle *ioh)
     340             : {
     341             :     Assert(ioh >= pgaio_ctl->io_handles &&
     342             :            ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
     343     1111286 :     return ioh - pgaio_ctl->io_handles;
     344             : }
     345             : 
     346             : /*
     347             :  * Return the ProcNumber for the process that can use an IO handle. The
     348             :  * mapping from IO handles to PGPROCs is static, therefore this even works
     349             :  * when the corresponding PGPROC is not in use.
                     :  *
                     :  * The value is read directly from the handle's owner_procno field.
     350             :  */
     351             : ProcNumber
     352           0 : pgaio_io_get_owner(PgAioHandle *ioh)
     353             : {
     354           0 :     return ioh->owner_procno;
     355             : }
     356             : 
     357             : /*
     358             :  * Return a wait reference for the IO. Only wait references can be used to
     359             :  * wait for an IO's completion, as handles themselves can be reused after
     360             :  * completion.  See also the comment above pgaio_io_acquire().
                     :  *
                     :  * The reference written to *iow consists of the handle's index within
                     :  * pgaio_ctl->io_handles and the handle's current generation, stored as
                     :  * separate upper and lower 32-bit halves. The handle has to be handed
                     :  * out, defined or staged, and its generation must be non-zero.
     361             :  */
     362             : void
     363     4793570 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
     364             : {
     365             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
     366             :            ioh->state == PGAIO_HS_DEFINED ||
     367             :            ioh->state == PGAIO_HS_STAGED);
     368             :     Assert(ioh->generation != 0);
     369             : 
     370     4793570 :     iow->aio_index = ioh - pgaio_ctl->io_handles;
     371     4793570 :     iow->generation_upper = (uint32) (ioh->generation >> 32);
     372     4793570 :     iow->generation_lower = (uint32) ioh->generation;
     373     4793570 : }
     374             : 
     375             : 
     376             : 
     377             : /* --------------------------------------------------------------------------------
     378             :  * Internal Functions related to PgAioHandle
     379             :  * --------------------------------------------------------------------------------
     380             :  */
     381             : 
     382             : static inline void
     383    18765144 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
     384             : {
     385             :     /*
     386             :      * All callers need to have held interrupts in some form, otherwise
     387             :      * interrupt processing could wait for the IO to complete, while in an
     388             :      * intermediary state.
     389             :      */
     390             :     Assert(!INTERRUPTS_CAN_BE_PROCESSED());
     391             : 
     392    18765144 :     pgaio_debug_io(DEBUG5, ioh,
     393             :                    "updating state to %s",
     394             :                    pgaio_io_state_get_name(new_state));
     395             : 
     396             :     /*
     397             :      * Ensure the changes signified by the new state are visible before the
     398             :      * new state becomes visible.
                     :      *
                     :      * Readers such as pgaio_io_was_recycled() issue a read barrier after
                     :      * loading ioh->state.
     399             :      */
     400    18765144 :     pg_write_barrier();
     401             : 
     402    18765144 :     ioh->state = new_state;
     403    18765144 : }
     404             : 
     405             : static void
     406     2400828 : pgaio_io_resowner_register(PgAioHandle *ioh)
     407             : {
     408             :     Assert(!ioh->resowner);
     409             :     Assert(CurrentResourceOwner);
     410             :     /*
                     :      * Remember the handle in CurrentResourceOwner and record that owner in
                     :      * ioh->resowner, which pgaio_io_release_resowner() later uses to
                     :      * detach the handle again. Note that the owner is always
                     :      * CurrentResourceOwner; no owner is passed in by the caller.
                     :      */
     411     2400828 :     ResourceOwnerRememberAioHandle(CurrentResourceOwner, &ioh->resowner_node);
     412     2400828 :     ioh->resowner = CurrentResourceOwner;
     413     2400828 : }
     414             : 
     415             : /*
     416             :  * Stage IO for execution and, if appropriate, submit it immediately.
     417             :  *
     418             :  * Should only be called from pgaio_io_start_*().
                     :  *
                     :  * ioh has to be this backend's handed-out handle and must already have a
                     :  * target. op is stored in the handle and the result is reset to 0. The
                     :  * handle then moves to PGAIO_HS_DEFINED, stops being the handed-out
                     :  * handle, has its stage callbacks run via pgaio_io_call_stage() and moves
                     :  * on to PGAIO_HS_STAGED.
                     :  *
                     :  * Afterwards the IO is either appended to the backend's staged_ios array
                     :  * (and submitted right away unless in batchmode), or, if synchronous
                     :  * execution is required, performed synchronously before returning.
     419             :  */
     420             : void
     421     2396770 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
     422             : {
     423             :     bool        needs_synchronous;
     424             : 
     425             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     426             :     Assert(pgaio_my_backend->handed_out_io == ioh);
     427             :     Assert(pgaio_io_has_target(ioh));
     428             : 
     429             :     /*
     430             :      * Otherwise an interrupt, in the middle of staging and possibly executing
     431             :      * the IO, could end up trying to wait for the IO, leading to state
     432             :      * confusion.
     433             :      */
     434     2396770 :     HOLD_INTERRUPTS();
     435             : 
     436     2396770 :     ioh->op = op;
     437     2396770 :     ioh->result = 0;
     438             : 
     439     2396770 :     pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
     440             : 
     441             :     /* allow a new IO to be staged */
     442     2396770 :     pgaio_my_backend->handed_out_io = NULL;
     443             : 
     444     2396770 :     pgaio_io_call_stage(ioh);
     445             : 
     446     2396770 :     pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
     447             : 
     448             :     /*
     449             :      * Synchronous execution has to be executed, well, synchronously, so check
     450             :      * that first.
     451             :      */
     452     2396770 :     needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
     453             : 
     454     2396770 :     pgaio_debug_io(DEBUG3, ioh,
     455             :                    "staged (synchronous: %d, in_batch: %d)",
     456             :                    needs_synchronous, pgaio_my_backend->in_batchmode);
     457             : 
     458     2396770 :     if (!needs_synchronous)
     459             :     {
     460     1053392 :         pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
     461             :         Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
     462             : 
     463             :         /*
     464             :          * Unless code explicitly opted into batching IOs, submit the IO
     465             :          * immediately.
     466             :          */
     467     1053392 :         if (!pgaio_my_backend->in_batchmode)
     468       53214 :             pgaio_submit_staged();
     469             :     }
     470             :     else
     471             :     {
     472     1343378 :         pgaio_io_prepare_submit(ioh);
     473     1343378 :         pgaio_io_perform_synchronously(ioh);
     474             :     }
     475             : 
     476     2396770 :     RESUME_INTERRUPTS();
     477     2396770 : }
     478             : 
     479             : bool
     480     2396770 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
     481             : {
     482             :     /*
     483             :      * If the caller said to execute the IO synchronously, do so.
     484             :      *
     485             :      * XXX: We could optimize the logic when to execute synchronously by first
     486             :      * checking if there are other IOs in flight and only synchronously
     487             :      * executing if not. Unclear whether that'll be sufficiently common to be
     488             :      * worth worrying about.
     489             :      */
     490     2396770 :     if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
     491     1335352 :         return true;
     492             : 
     493             :     /*
                     :      * Check if the IO method requires synchronous execution of IO. The
                     :      * needs_synchronous_execution callback is optional; if the method does
                     :      * not provide one, false is returned.
                     :      */
     494     1061418 :     if (pgaio_method_ops->needs_synchronous_execution)
     495     1061418 :         return pgaio_method_ops->needs_synchronous_execution(ioh);
     496             : 
     497           0 :     return false;
     498             : }
     499             : 
     500             : /*
     501             :  * Handle IO being processed by IO method.
     502             :  *
     503             :  * Should be called by IO methods / synchronous IO execution, just before the
     504             :  * IO is performed.
                     :  *
                     :  * Moves the handle to PGAIO_HS_SUBMITTED and appends it to the tail of
                     :  * this backend's in_flight_ios list.
     505             :  */
     506             : void
     507     2396770 : pgaio_io_prepare_submit(PgAioHandle *ioh)
     508             : {
     509     2396770 :     pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
     510             : 
     511     2396770 :     dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
     512     2396770 : }
     513             : 
     514             : /*
     515             :  * Handle IO getting completed by a method.
     516             :  *
     517             :  * Should be called by IO methods / synchronous IO execution, just after the
     518             :  * IO has been performed.
     519             :  *
     520             :  * Expects to be called in a critical section. We expect IOs to be usable for
     521             :  * WAL etc, which requires being able to execute completion callbacks in a
     522             :  * critical section.
                     :  *
                     :  * result is stored in the handle before the state advances to
                     :  * PGAIO_HS_COMPLETED_IO. After the shared completion callbacks ran, the
                     :  * state becomes PGAIO_HS_COMPLETED_SHARED and waiters on the handle's
                     :  * condition variable are woken. The handle is reclaimed here only if the
                     :  * calling process owns it; otherwise it is left in
                     :  * PGAIO_HS_COMPLETED_SHARED by this function.
     523             :  */
     524             : void
     525     2188204 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
     526             : {
     527             :     Assert(ioh->state == PGAIO_HS_SUBMITTED);
     528             : 
     529             :     Assert(CritSectionCount > 0);
     530             : 
     531     2188204 :     ioh->result = result;
     532             : 
     533     2188204 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
     534             : 
     535     2188204 :     INJECTION_POINT("aio-process-completion-before-shared", ioh);
     536             : 
     537     2188204 :     pgaio_io_call_complete_shared(ioh);
     538             : 
     539     2188204 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
     540             : 
     541             :     /* condition variable broadcast ensures state is visible before wakeup */
     542     2188204 :     ConditionVariableBroadcast(&ioh->cv);
     543             : 
     544             :     /* contains call to pgaio_io_call_complete_local() */
     545     2188204 :     if (ioh->owner_procno == MyProcNumber)
     546     1343378 :         pgaio_io_reclaim(ioh);
     547     2188204 : }
     548             : 
     549             : /*
     550             :  * Has the IO completed and thus the IO handle been reused?
     551             :  *
     552             :  * This is useful when waiting for IO completion at a low level (e.g. in an IO
     553             :  * method's ->wait_one() callback).
                     :  *
                     :  * *state is always set to the handle's state, which is read before the
                     :  * generation (separated by a read barrier). Returns true if the handle's
                     :  * generation differs from ref_generation, false otherwise.
     554             :  */
     555             : bool
     556     2376394 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
     557             : {
     558     2376394 :     *state = ioh->state;
     559     2376394 :     pg_read_barrier();
     560             : 
     561     2376394 :     return ioh->generation != ref_generation;
     562             : }
     563             : 
     564             : /*
     565             :  * Wait for IO to complete. External code should never use this, outside of
     566             :  * the AIO subsystem waits are only allowed via pgaio_wref_wait().
     567             :  */
     568             : static void
     569      266200 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
     570             : {
     571             :     PgAioHandleState state;
     572             :     bool        am_owner;
     573             : 
     574      266200 :     am_owner = ioh->owner_procno == MyProcNumber;
     575             : 
     576      266200 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     577          70 :         return;
     578             : 
     579      266130 :     if (am_owner)
     580             :     {
     581      262910 :         if (state != PGAIO_HS_SUBMITTED
     582       49100 :             && state != PGAIO_HS_COMPLETED_IO
     583         304 :             && state != PGAIO_HS_COMPLETED_SHARED
     584           0 :             && state != PGAIO_HS_COMPLETED_LOCAL)
     585             :         {
     586           0 :             elog(PANIC, "waiting for own IO %d in wrong state: %s",
     587             :                  pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
     588             :         }
     589             :     }
     590             : 
     591             :     while (true)
     592             :     {
     593      531816 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     594        2386 :             return;
     595             : 
     596      529430 :         switch (state)
     597             :         {
     598           0 :             case PGAIO_HS_IDLE:
     599             :             case PGAIO_HS_HANDED_OUT:
     600           0 :                 elog(ERROR, "IO in wrong state: %d", state);
     601             :                 break;
     602             : 
     603      215358 :             case PGAIO_HS_SUBMITTED:
     604             : 
     605             :                 /*
     606             :                  * If we need to wait via the IO method, do so now. Don't
     607             :                  * check via the IO method if the issuing backend is executing
     608             :                  * the IO synchronously.
     609             :                  */
     610      215358 :                 if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
     611             :                 {
     612           0 :                     pgaio_method_ops->wait_one(ioh, ref_generation);
     613           0 :                     continue;
     614             :                 }
     615             :                 /* fallthrough */
     616             : 
     617             :                 /* waiting for owner to submit */
     618             :             case PGAIO_HS_DEFINED:
     619             :             case PGAIO_HS_STAGED:
     620             :                 /* waiting for reaper to complete */
     621             :                 /* fallthrough */
     622             :             case PGAIO_HS_COMPLETED_IO:
     623             :                 /* shouldn't be able to hit this otherwise */
     624             :                 Assert(IsUnderPostmaster);
     625             :                 /* ensure we're going to get woken up */
     626      265686 :                 ConditionVariablePrepareToSleep(&ioh->cv);
     627             : 
     628      530706 :                 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
     629             :                 {
     630      528338 :                     if (state == PGAIO_HS_COMPLETED_SHARED ||
     631      265052 :                         state == PGAIO_HS_COMPLETED_LOCAL)
     632             :                         break;
     633      265020 :                     ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
     634             :                 }
     635             : 
     636      265686 :                 ConditionVariableCancelSleep();
     637      265686 :                 break;
     638             : 
     639      263744 :             case PGAIO_HS_COMPLETED_SHARED:
     640             :             case PGAIO_HS_COMPLETED_LOCAL:
     641             : 
     642             :                 /*
     643             :                  * Note that no interrupts are processed between
     644             :                  * pgaio_io_was_recycled() and this check - that's important
     645             :                  * as otherwise an interrupt could have already reclaimed the
     646             :                  * handle.
     647             :                  */
     648      263744 :                 if (am_owner)
     649      262910 :                     pgaio_io_reclaim(ioh);
     650      263744 :                 return;
     651             :         }
     652             :     }
     653             : }
     654             : 
     655             : /*
     656             :  * Make IO handle ready to be reused after IO has completed or after the
     657             :  * handle has been released without being used.
     658             :  *
     659             :  * Note that callers need to be careful about only calling this in the right
     660             :  * state and that no interrupts can be processed between the state check and
     661             :  * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
     662             :  * already have reclaimed the handle.
     663             :  */
     664             : static void
     665     2400828 : pgaio_io_reclaim(PgAioHandle *ioh)
     666             : {
     667             :     /* This is only ok if it's our IO */
     668             :     Assert(ioh->owner_procno == MyProcNumber);
     669             :     Assert(ioh->state != PGAIO_HS_IDLE);
     670             : 
     671             :     /* see comment in function header */
     672     2400828 :     HOLD_INTERRUPTS();
     673             : 
     674             :     /*
     675             :      * It's a bit ugly, but right now the easiest place to put the execution
     676             :      * of local completion callbacks is this function, as we need to execute
     677             :      * local callbacks just before reclaiming at multiple callsites.
     678             :      */
     679     2400828 :     if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     680             :     {
     681             :         PgAioResult local_result;
     682             : 
     683     2396770 :         local_result = pgaio_io_call_complete_local(ioh);
     684     2396770 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
     685             : 
     686     2396770 :         if (ioh->report_return)
     687             :         {
     688     2396742 :             ioh->report_return->result = local_result;
     689     2396742 :             ioh->report_return->target_data = ioh->target_data;
     690             :         }
     691             :     }
     692             : 
     693     2400828 :     pgaio_debug_io(DEBUG4, ioh,
     694             :                    "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
     695             :                    pgaio_result_status_string(ioh->distilled_result.status),
     696             :                    ioh->distilled_result.id,
     697             :                    ioh->distilled_result.error_data,
     698             :                    ioh->result);
     699             : 
     700             :     /* if the IO has been defined, it's on the in-flight list, remove */
     701     2400828 :     if (ioh->state != PGAIO_HS_HANDED_OUT)
     702     2396770 :         dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
     703             : 
     704     2400828 :     if (ioh->resowner)
     705             :     {
     706     2400734 :         ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     707     2400734 :         ioh->resowner = NULL;
     708             :     }
     709             : 
     710             :     Assert(!ioh->resowner);
     711             : 
     712             :     /*
     713             :      * Update generation & state first, before resetting the IO's fields,
     714             :      * otherwise a concurrent "viewer" could think the fields are valid, even
     715             :      * though they are being reset.  Increment the generation first, so that
     716             :      * we can assert elsewhere that we never wait for an IDLE IO.  While it's
     717             :      * a bit weird for the state to go backwards for a generation, it's OK
     718             :      * here, as there cannot be references to the "reborn" IO yet.  Can't
     719             :      * update both at once, so something has to give.
     720             :      */
     721     2400828 :     ioh->generation++;
     722     2400828 :     pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
     723             : 
     724             :     /* ensure the state update is visible before we reset fields */
     725     2400828 :     pg_write_barrier();
     726             : 
     727     2400828 :     ioh->op = PGAIO_OP_INVALID;
     728     2400828 :     ioh->target = PGAIO_TID_INVALID;
     729     2400828 :     ioh->flags = 0;
     730     2400828 :     ioh->num_callbacks = 0;
     731     2400828 :     ioh->handle_data_len = 0;
     732     2400828 :     ioh->report_return = NULL;
     733     2400828 :     ioh->result = 0;
     734     2400828 :     ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
     735             : 
     736             :     /*
     737             :      * We push the IO to the head of the idle IO list, that seems more cache
     738             :      * efficient in cases where only a few IOs are used.
     739             :      */
     740     2400828 :     dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
     741             : 
     742     2400828 :     RESUME_INTERRUPTS();
     743     2400828 : }
     744             : 
     745             : /*
     746             :  * Wait for an IO handle to become usable.
     747             :  *
     748             :  * This is only really useful for pgaio_io_acquire().
     749             :  */
     750             : static void
     751        5668 : pgaio_io_wait_for_free(void)
     752             : {
     753        5668 :     int         reclaimed = 0;
     754             : 
     755        5668 :     pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
     756             :                 pgaio_my_backend->num_staged_ios,
     757             :                 dclist_count(&pgaio_my_backend->in_flight_ios),
     758             :                 dclist_count(&pgaio_my_backend->idle_ios));
     759             : 
     760             :     /*
     761             :      * First check if any of our IOs actually have completed - when using
     762             :      * worker, that'll often be the case. We could do so as part of the loop
     763             :      * below, but that'd potentially lead us to wait for some IO submitted
     764             :      * before.
     765             :      */
     766       11336 :     for (int i = 0; i < io_max_concurrency; i++)
     767             :     {
     768        5668 :         PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
     769             : 
     770        5668 :         if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     771             :         {
     772             :             /*
     773             :              * Note that no interrupts are processed between the state check
     774             :              * and the call to reclaim - that's important as otherwise an
     775             :              * interrupt could have already reclaimed the handle.
     776             :              */
     777        4670 :             pgaio_io_reclaim(ioh);
     778        4670 :             reclaimed++;
     779             :         }
     780             :     }
     781             : 
     782        5668 :     if (reclaimed > 0)
     783        4670 :         return;
     784             : 
     785             :     /*
     786             :      * If we have any unsubmitted IOs, submit them now. We'll start waiting in
     787             :      * a second, so it's better they're in flight. This also addresses the
     788             :      * edge-case that all IOs are unsubmitted.
     789             :      */
     790         998 :     if (pgaio_my_backend->num_staged_ios > 0)
     791           0 :         pgaio_submit_staged();
     792             : 
     793             :     /* possibly some IOs finished during submission */
     794         998 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     795           0 :         return;
     796             : 
     797         998 :     if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
     798           0 :         ereport(ERROR,
     799             :                 errmsg_internal("no free IOs despite no in-flight IOs"),
     800             :                 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
     801             :                                    pgaio_my_backend->num_staged_ios,
     802             :                                    dclist_count(&pgaio_my_backend->in_flight_ios),
     803             :                                    dclist_count(&pgaio_my_backend->idle_ios)));
     804             : 
     805             :     /*
     806             :      * Wait for the oldest in-flight IO to complete.
     807             :      *
     808             :      * XXX: Reusing the general IO wait is suboptimal: we don't need to wait
     809             :      * for that specific IO to complete, we just need *any* IO to complete.
     810             :      */
     811             :     {
     812         998 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
     813             :                                                &pgaio_my_backend->in_flight_ios);
     814         998 :         uint64      generation = ioh->generation;
     815             : 
     816         998 :         switch (ioh->state)
     817             :         {
     818             :                 /* should not be in in-flight list */
     819           0 :             case PGAIO_HS_IDLE:
     820             :             case PGAIO_HS_DEFINED:
     821             :             case PGAIO_HS_HANDED_OUT:
     822             :             case PGAIO_HS_STAGED:
     823             :             case PGAIO_HS_COMPLETED_LOCAL:
     824           0 :                 elog(ERROR, "shouldn't get here with io:%d in state %d",
     825             :                      pgaio_io_get_id(ioh), ioh->state);
     826             :                 break;
     827             : 
     828         998 :             case PGAIO_HS_COMPLETED_IO:
     829             :             case PGAIO_HS_SUBMITTED:
     830         998 :                 pgaio_debug_io(DEBUG2, ioh,
     831             :                                "waiting for free io with %u in flight",
     832             :                                dclist_count(&pgaio_my_backend->in_flight_ios));
     833             : 
     834             :                 /*
     835             :                  * In a more general case this would be racy, because the
     836             :                  * generation could increase after we read ioh->state above.
     837             :                  * But we are only looking at IOs by the current backend and
     838             :                  * the IO can only be recycled by this backend.  Even this is
     839             :                  * only OK because we get the handle's generation before
     840             :                  * potentially processing interrupts, e.g. as part of
     841             :                  * pgaio_debug_io().
     842             :                  */
     843         998 :                 pgaio_io_wait(ioh, generation);
     844         998 :                 break;
     845             : 
     846           0 :             case PGAIO_HS_COMPLETED_SHARED:
     847             : 
     848             :                 /*
     849             :                  * It's possible that another backend just finished this IO.
     850             :                  *
     851             :                  * Note that no interrupts are processed between the state
     852             :                  * check and the call to reclaim - that's important as
     853             :                  * otherwise an interrupt could have already reclaimed the
     854             :                  * handle.
     855             :                  */
     856           0 :                 pgaio_io_reclaim(ioh);
     857           0 :                 break;
     858             :         }
     859             : 
     860         998 :         if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
     861           0 :             elog(PANIC, "no idle IO after waiting for IO to terminate");
     862         998 :         return;
     863             :     }
     864             : }
     865             : 
     866             : /*
     867             :  * Internal - code outside of AIO should never need this and it'd be hard for
     868             :  * such code to be safe.
     869             :  */
     870             : static PgAioHandle *
     871     1312856 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
     872             : {
     873             :     PgAioHandle *ioh;
     874             : 
     875             :     Assert(iow->aio_index < pgaio_ctl->io_handle_count);
     876             : 
     877     1312856 :     ioh = &pgaio_ctl->io_handles[iow->aio_index];
     878             : 
     879     1312856 :     *ref_generation = ((uint64) iow->generation_upper) << 32 |
     880     1312856 :         iow->generation_lower;
     881             : 
     882             :     Assert(*ref_generation != 0);
     883             : 
     884     1312856 :     return ioh;
     885             : }
     886             : 
     887             : static const char *
     888       14458 : pgaio_io_state_get_name(PgAioHandleState s)
     889             : {
     890             : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
     891       14458 :     switch (s)
     892             :     {
     893           0 :             PGAIO_HS_TOSTR_CASE(IDLE);
     894        4764 :             PGAIO_HS_TOSTR_CASE(HANDED_OUT);
     895        2382 :             PGAIO_HS_TOSTR_CASE(DEFINED);
     896        2382 :             PGAIO_HS_TOSTR_CASE(STAGED);
     897         148 :             PGAIO_HS_TOSTR_CASE(SUBMITTED);
     898        2382 :             PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
     899        2400 :             PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
     900           0 :             PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
     901             :     }
     902             : #undef PGAIO_HS_TOSTR_CASE
     903             : 
     904           0 :     return NULL;                /* silence compiler */
     905             : }
     906             : 
     907             : const char *
     908       14458 : pgaio_io_get_state_name(PgAioHandle *ioh)
     909             : {
     910       14458 :     return pgaio_io_state_get_name(ioh->state);
     911             : }
     912             : 
     913             : const char *
     914        4764 : pgaio_result_status_string(PgAioResultStatus rs)
     915             : {
     916        4764 :     switch (rs)
     917             :     {
     918           0 :         case PGAIO_RS_UNKNOWN:
     919           0 :             return "UNKNOWN";
     920        4404 :         case PGAIO_RS_OK:
     921        4404 :             return "OK";
     922         136 :         case PGAIO_RS_WARNING:
     923         136 :             return "WARNING";
     924          40 :         case PGAIO_RS_PARTIAL:
     925          40 :             return "PARTIAL";
     926         184 :         case PGAIO_RS_ERROR:
     927         184 :             return "ERROR";
     928             :     }
     929             : 
     930           0 :     return NULL;                /* silence compiler */
     931             : }
     932             : 
     933             : 
     934             : 
     935             : /* --------------------------------------------------------------------------------
     936             :  * Functions primarily related to IO Wait References
     937             :  * --------------------------------------------------------------------------------
     938             :  */
     939             : 
     940             : /*
     941             :  * Mark a wait reference as invalid
     942             :  */
     943             : void
     944    25515220 : pgaio_wref_clear(PgAioWaitRef *iow)
     945             : {
     946    25515220 :     iow->aio_index = PG_UINT32_MAX;
     947    25515220 : }
     948             : 
     949             : /* Is the wait reference valid? */
     950             : bool
     951     4897400 : pgaio_wref_valid(PgAioWaitRef *iow)
     952             : {
     953     4897400 :     return iow->aio_index != PG_UINT32_MAX;
     954             : }
     955             : 
     956             : /*
     957             :  * Similar to pgaio_io_get_id(), just for wait references.
     958             :  */
     959             : int
     960           0 : pgaio_wref_get_id(PgAioWaitRef *iow)
     961             : {
     962             :     Assert(pgaio_wref_valid(iow));
     963           0 :     return iow->aio_index;
     964             : }
     965             : 
     966             : /*
     967             :  * Wait for the IO to have completed. Can be called in any process, not just
     968             :  * in the issuing backend.
     969             :  */
     970             : void
     971      265184 : pgaio_wref_wait(PgAioWaitRef *iow)
     972             : {
     973             :     uint64      ref_generation;
     974             :     PgAioHandle *ioh;
     975             : 
     976      265184 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
     977             : 
     978      265184 :     pgaio_io_wait(ioh, ref_generation);
     979      265184 : }
     980             : 
     981             : /*
     982             :  * Check if the referenced IO completed, without blocking.
     983             :  */
     984             : bool
     985     1047672 : pgaio_wref_check_done(PgAioWaitRef *iow)
     986             : {
     987             :     uint64      ref_generation;
     988             :     PgAioHandleState state;
     989             :     bool        am_owner;
     990             :     PgAioHandle *ioh;
     991             : 
     992     1047672 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
     993             : 
     994     1047672 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     995           0 :         return true;
     996             : 
     997     1047672 :     if (state == PGAIO_HS_IDLE)
     998           0 :         return true;
     999             : 
    1000     1047672 :     am_owner = ioh->owner_procno == MyProcNumber;
    1001             : 
    1002     1047672 :     if (state == PGAIO_HS_COMPLETED_SHARED ||
    1003      261860 :         state == PGAIO_HS_COMPLETED_LOCAL)
    1004             :     {
    1005             :         /*
    1006             :          * Note that no interrupts are processed between
    1007             :          * pgaio_io_was_recycled() and this check - that's important as
    1008             :          * otherwise an interrupt could have already reclaimed the handle.
    1009             :          */
    1010      785812 :         if (am_owner)
    1011      785812 :             pgaio_io_reclaim(ioh);
    1012      785812 :         return true;
    1013             :     }
    1014             : 
    1015             :     /*
    1016             :      * XXX: It likely would be worth checking in with the io method, to give
    1017             :      * the IO method a chance to check if there are completion events queued.
    1018             :      */
    1019             : 
    1020      261860 :     return false;
    1021             : }
    1022             : 
    1023             : 
    1024             : 
    1025             : /* --------------------------------------------------------------------------------
    1026             :  * Actions on multiple IOs.
    1027             :  * --------------------------------------------------------------------------------
    1028             :  */
    1029             : 
    1030             : /*
    1031             :  * Submit IOs in batches going forward.
    1032             :  *
    1033             :  * Submitting multiple IOs at once can be substantially faster than doing so
    1034             :  * one-by-one. At the same time, submitting multiple IOs at once requires more
    1035             :  * care to avoid deadlocks.
    1036             :  *
    1037             :  * Consider backend A staging an IO for buffer 1 and then trying to start IO
    1038             :  * on buffer 2, while backend B does the inverse. If A submitted the IO before
    1039             :  * moving on to buffer 2, this works just fine, B will wait for the IO to
    1040             :  * complete. But if batching were used, each backend will wait for IO that has
    1041             :  * not yet been submitted to complete, i.e. forever.
    1042             :  *
    1043             :  * End batch submission mode with pgaio_exit_batchmode().  (Throwing errors is
    1044             :  * allowed; error recovery will end the batch.)
    1045             :  *
    1046             :  * To avoid deadlocks, code needs to ensure that it will not wait for another
    1047             :  * backend while there is unsubmitted IO. E.g. by using conditional lock
    1048             :  * acquisition when acquiring buffer locks. To check if there currently are
    1049             :  * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
    1050             :  * pgaio_submit_staged().
    1051             :  *
    1052             :  * It is not allowed to enter batchmode while already in batchmode. Nesting
    1053             :  * is unlikely to ever be needed, as code needs to be explicitly aware of
    1054             :  * being called in batchmode to avoid the deadlock risks explained above.
    1055             :  *
    1056             :  * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
    1057             :  * e.g. because too many IOs have been staged or because pgaio_submit_staged()
    1058             :  * was called.
    1059             :  */
    1060             : void
    1061     5343556 : pgaio_enter_batchmode(void)
    1062             : {
    1063     5343556 :     if (pgaio_my_backend->in_batchmode)
    1064           0 :         elog(ERROR, "starting batch while batch already in progress");
    1065     5343556 :     pgaio_my_backend->in_batchmode = true;
    1066     5343556 : }
    1067             : 
    1068             : /*
    1069             :  * Stop submitting IOs in batches.
    1070             :  */
    1071             : void
    1072     5343536 : pgaio_exit_batchmode(void)
    1073             : {
    1074             :     Assert(pgaio_my_backend->in_batchmode);
    1075             : 
    1076     5343536 :     pgaio_submit_staged();
    1077     5343536 :     pgaio_my_backend->in_batchmode = false;
    1078     5343536 : }
    1079             : 
    1080             : /*
    1081             :  * Are there staged but unsubmitted IOs?
    1082             :  *
    1083             :  * See comment above pgaio_enter_batchmode() for why code may need to check if
    1084             :  * there is IO in that state.
    1085             :  */
    1086             : bool
    1087     2400632 : pgaio_have_staged(void)
    1088             : {
    1089             :     Assert(pgaio_my_backend->in_batchmode ||
    1090             :            pgaio_my_backend->num_staged_ios == 0);
    1091     2400632 :     return pgaio_my_backend->num_staged_ios > 0;
    1092             : }
    1093             : 
    1094             : /*
    1095             :  * Submit all staged but not yet submitted IOs.
    1096             :  *
    1097             :  * Unless in batch mode, this never needs to be called, as IOs get submitted
    1098             :  * as soon as possible. While in batchmode pgaio_submit_staged() can be called
    1099             :  * before waiting on another backend, to avoid the risk of deadlocks. See
    1100             :  * pgaio_enter_batchmode().
    1101             :  */
    1102             : void
    1103     5402442 : pgaio_submit_staged(void)
    1104             : {
    1105     5402442 :     int         total_submitted = 0;
    1106             :     int         did_submit;
    1107             : 
    1108     5402442 :     if (pgaio_my_backend->num_staged_ios == 0)
    1109     4350190 :         return;
    1110             : 
    1111             : 
    1112     1052252 :     START_CRIT_SECTION();
    1113             : 
    1114     1052252 :     did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
    1115     1052252 :                                           pgaio_my_backend->staged_ios);
    1116             : 
    1117     1052252 :     END_CRIT_SECTION();
    1118             : 
    1119     1052252 :     total_submitted += did_submit;
    1120             : 
    1121             :     Assert(total_submitted == did_submit);
    1122             : 
    1123     1052252 :     pgaio_my_backend->num_staged_ios = 0;
    1124             : 
    1125     1052252 :     pgaio_debug(DEBUG4,
    1126             :                 "aio: submitted %d IOs",
    1127             :                 total_submitted);
    1128             : }
    1129             : 
    1130             : 
    1131             : 
    1132             : /* --------------------------------------------------------------------------------
    1133             :  * Other
    1134             :  * --------------------------------------------------------------------------------
    1135             :  */
    1136             : 
    1137             : 
    1138             : /*
    1139             :  * Perform AIO related cleanup after an error.
    1140             :  *
    1141             :  * This should be called early in the error recovery paths, as later steps may
    1142             :  * need to issue AIO (e.g. to record a transaction abort WAL record).
    1143             :  */
    1144             : void
    1145       58528 : pgaio_error_cleanup(void)
    1146             : {
    1147             :     /*
    1148             :      * It is possible that code errored out after pgaio_enter_batchmode() but
    1149             :      * before pgaio_exit_batchmode() was called. In that case we need to
    1150             :      * submit the IO now.
    1151             :      */
    1152       58528 :     if (pgaio_my_backend->in_batchmode)
    1153             :     {
    1154          20 :         pgaio_my_backend->in_batchmode = false;
    1155             : 
    1156          20 :         pgaio_submit_staged();
    1157             :     }
    1158             : 
    1159             :     /*
    1160             :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1161             :      */
    1162             :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1163       58528 : }
    1164             : 
    1165             : /*
    1166             :  * Perform AIO related checks at (sub-)transactional boundaries.
    1167             :  *
    1168             :  * This should be called late during (sub-)transactional commit/abort, after
    1169             :  * all steps that might need to perform AIO, so that we can verify that the
    1170             :  * AIO subsystem is in a valid state at the end of a transaction.
    1171             :  */
    1172             : void
    1173     1136058 : AtEOXact_Aio(bool is_commit)
    1174             : {
    1175             :     /*
    1176             :      * We should never be in batch mode at transactional boundaries. In case
    1177             :      * an error was thrown while in batch mode, pgaio_error_cleanup() should
    1178             :      * have exited batchmode.
    1179             :      *
    1180             :      * In case we are in batchmode somehow, make sure to submit all staged
    1181             :      * IOs; other backends may need them to complete before they can continue.
    1182             :      */
    1183     1136058 :     if (pgaio_my_backend->in_batchmode)
    1184             :     {
    1185           8 :         pgaio_error_cleanup();
    1186           8 :         elog(WARNING, "open AIO batch at end of (sub-)transaction");
    1187             :     }
    1188             : 
    1189             :     /*
    1190             :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1191             :      */
    1192             :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1193     1136058 : }
    1194             : 
    1195             : /*
    1196             :  * To be called before fd gets closed. Staged but not yet submitted IOs need
    1197             :  * to be submitted first, otherwise the IO would end up targeting something bogus.
    1198             :  */
    1199             : void
    1200    16517762 : pgaio_closing_fd(int fd)
    1201             : {
    1202             :     /*
    1203             :      * Might be called before AIO is initialized or in a subprocess that
    1204             :      * doesn't use AIO. pgaio_my_backend is not set in either case.
    1205             :      */
    1206    16517762 :     if (!pgaio_my_backend)
    1207       13552 :         return;
    1208             : 
    1209             :     /*
    1210             :      * For now just submit all staged IOs, whether they use fd or not - we
    1211             :      * could be more selective, but it's probably not worth it.
    1212             :      */
    1213    16504210 :     if (pgaio_my_backend->num_staged_ios > 0)
    1214             :     {
    1215           4 :         pgaio_debug(DEBUG2,
    1216             :                     "submitting %d IOs before FD %d gets closed",
    1217             :                     pgaio_my_backend->num_staged_ios, fd);
    1218           4 :         pgaio_submit_staged();
    1219             :     }
    1220             : 
    1221             :     /*
    1222             :      * If requested by the IO method, wait for all in-flight IOs of this
    1223             :      * backend that use the to-be-closed FD.
    1224             :      */
    1225    16504210 :     if (pgaio_method_ops->wait_on_fd_before_close)
    1226             :     {
    1227             :         /*
    1228             :          * As waiting for one IO to complete may complete multiple IOs, we
    1229             :          * can't just use a mutable list iterator. The maximum number of
    1230             :          * in-flight IOs is fairly small, so just rescan the list from its
    1231             :          * head after waiting for an IO, until no IO using fd is found.
    1232             :          */
    1233           0 :         while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1234             :         {
    1235             :             dlist_iter  iter;
    1236           0 :             PgAioHandle *ioh = NULL;
    1237             :             uint64      generation;
    1238             : 
    1239           0 :             dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
    1240             :             {
    1241           0 :                 ioh = dclist_container(PgAioHandle, node, iter.cur);
    1242             : 
    1243           0 :                 generation = ioh->generation;    /* remembered for pgaio_io_wait() below */
    1244             : 
    1245           0 :                 if (pgaio_io_uses_fd(ioh, fd))
    1246           0 :                     break;
    1247             :                 else
    1248           0 :                     ioh = NULL;
    1249             :             }
    1250             : 
    1251           0 :             if (!ioh)    /* no in-flight IO uses fd */
    1252           0 :                 break;
    1253             : 
    1254           0 :             pgaio_debug_io(DEBUG2, ioh,
    1255             :                            "waiting for IO before FD %d gets closed, %u in-flight IOs",
    1256             :                            fd, dclist_count(&pgaio_my_backend->in_flight_ios));
    1257             : 
    1258             :             /* see comment in pgaio_io_wait_for_free() about raciness */
    1259           0 :             pgaio_io_wait(ioh, generation);
    1260             :         }
    1261             :     }
    1262             : }
    1263             : 
    1264             : /*
    1265             :  * Registered as before_shmem_exit() callback in pgaio_init_backend(); arg is unused.
    1266             :  */
    1267             : void
    1268       38764 : pgaio_shutdown(int code, Datum arg)
    1269             : {
    1270             :     Assert(pgaio_my_backend);
    1271             :     Assert(!pgaio_my_backend->handed_out_io);
    1272             : 
    1273             :     /* first clean up resources as we would at a transaction boundary; code == 0 is passed on as is_commit */
    1274       38764 :     AtEOXact_Aio(code == 0);
    1275             : 
    1276             :     /*
    1277             :      * Before exiting, make sure that all IOs are finished. That has two main
    1278             :      * purposes:
    1279             :      *
    1280             :      * - Some kernel-level AIO mechanisms don't deal well with the issuer of
    1281             :      * an AIO exiting before the IO has completed
    1282             :      *
    1283             :      * - It'd be confusing to see partially finished IOs in stats views etc.
    1284             :      */
    1285       38782 :     while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1286             :     {
    1287          18 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
    1288          18 :         uint64      generation = ioh->generation;
    1289             : 
    1290          18 :         pgaio_debug_io(DEBUG2, ioh,
    1291             :                        "waiting for IO to complete during shutdown, %u in-flight IOs",
    1292             :                        dclist_count(&pgaio_my_backend->in_flight_ios));
    1293             : 
    1294             :         /* see comment in pgaio_io_wait_for_free() about raciness */
    1295          18 :         pgaio_io_wait(ioh, generation);
    1296             :     }
    1297             : 
    1298       38764 :     pgaio_my_backend = NULL;    /* later pgaio_closing_fd() calls return early because of this */
    1299       38764 : }
    1300             : /* Presumably the GUC assign hook for io_method (TODO confirm): makes pgaio_method_ops point at the table entry for newval; extra is unused. */
    1301             : void
    1302        2186 : assign_io_method(int newval, void *extra)
    1303             : {
    1304             :     Assert(pgaio_method_ops_table[newval] != NULL);    /* NOTE(review): indexes the table before the range check on the next line */
    1305             :     Assert(newval < lengthof(io_method_options));    /* NOTE(review): bound is io_method_options' length, not the ops table's - confirm both have the same size */
    1306             : 
    1307        2186 :     pgaio_method_ops = pgaio_method_ops_table[newval];
    1308        2186 : }
    1309             : /* Check a proposed io_max_concurrency value: returns false only for 0; *newval is not modified, extra and source are unused. */
    1310             : bool
    1311        4256 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
    1312             : {
    1313        4256 :     if (*newval == -1)
    1314             :     {
    1315             :         /*
    1316             :          * -1 is accepted as-is. Auto-tuning will be applied later during
    1317             :          * startup, as auto-tuning depends on the value of various GUCs.
    1318             :          */
    1319        2164 :         return true;
    1320             :     }
    1321        2092 :     else if (*newval == 0)
    1322             :     {
    1323           0 :         GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
    1324           0 :         return false;
    1325             :     }
    1326             : 
    1327        2092 :     return true;    /* any other value is accepted by this function */
    1328             : }

Generated by: LCOV version 1.16