LCOV - code coverage report
Current view: top level - src/backend/storage/aio - aio.c (source / functions) Hit Total Coverage
Test: PostgreSQL 18beta1 Lines: 287 335 85.7 %
Date: 2025-06-28 05:17:58 Functions: 35 37 94.6 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * aio.c
       4             :  *    AIO - Core Logic
       5             :  *
       6             :  * For documentation about how AIO works on a higher level, including a
       7             :  * schematic example, see README.md.
       8             :  *
       9             :  *
      10             :  * AIO is a complicated subsystem. To keep things navigable, it is split
      11             :  * across a number of files:
      12             :  *
      13             :  * - method_*.c - different ways of executing AIO (e.g. worker process)
      14             :  *
      15             :  * - aio_target.c - IO on different kinds of targets
      16             :  *
      17             :  * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
      18             :  *
      19             :  * - aio_callback.c - callbacks at IO operation lifecycle events
      20             :  *
      21             :  * - aio_init.c - per-server and per-backend initialization
      22             :  *
      23             :  * - aio.c - all other topics
      24             :  *
      25             :  * - read_stream.c - helper for reading buffered relation data
      26             :  *
      27             :  * - README.md - higher-level overview of AIO
      28             :  *
      29             :  *
      30             :  * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
      31             :  * Portions Copyright (c) 1994, Regents of the University of California
      32             :  *
      33             :  * IDENTIFICATION
      34             :  *    src/backend/storage/aio/aio.c
      35             :  *
      36             :  *-------------------------------------------------------------------------
      37             :  */
      38             : 
      39             : #include "postgres.h"
      40             : 
      41             : #include "lib/ilist.h"
      42             : #include "miscadmin.h"
      43             : #include "port/atomics.h"
      44             : #include "storage/aio.h"
      45             : #include "storage/aio_internal.h"
      46             : #include "storage/aio_subsys.h"
      47             : #include "utils/guc.h"
      48             : #include "utils/guc_hooks.h"
      49             : #include "utils/injection_point.h"
      50             : #include "utils/resowner.h"
      51             : #include "utils/wait_event_types.h"
      52             : 
      53             : 
      54             : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
      55             : static void pgaio_io_reclaim(PgAioHandle *ioh);
      56             : static void pgaio_io_resowner_register(PgAioHandle *ioh);
      57             : static void pgaio_io_wait_for_free(void);
      58             : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
      59             : static const char *pgaio_io_state_get_name(PgAioHandleState s);
      60             : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
      61             : 
      62             : 
      63             : /* Options for io_method. */
      64             : const struct config_enum_entry io_method_options[] = {
      65             :     {"sync", IOMETHOD_SYNC, false},
      66             :     {"worker", IOMETHOD_WORKER, false},
      67             : #ifdef IOMETHOD_IO_URING_ENABLED
      68             :     {"io_uring", IOMETHOD_IO_URING, false},
      69             : #endif
      70             :     {NULL, 0, false}
      71             : };
      72             : 
      73             : /* GUCs */
      74             : int         io_method = DEFAULT_IO_METHOD;
      75             : int         io_max_concurrency = -1;
      76             : 
      77             : /* global control for AIO */
      78             : PgAioCtl   *pgaio_ctl;
      79             : 
      80             : /* current backend's per-backend state */
      81             : PgAioBackend *pgaio_my_backend;
      82             : 
      83             : 
      84             : static const IoMethodOps *const pgaio_method_ops_table[] = {
      85             :     [IOMETHOD_SYNC] = &pgaio_sync_ops,
      86             :     [IOMETHOD_WORKER] = &pgaio_worker_ops,
      87             : #ifdef IOMETHOD_IO_URING_ENABLED
      88             :     [IOMETHOD_IO_URING] = &pgaio_uring_ops,
      89             : #endif
      90             : };
      91             : 
      92             : /* callbacks for the configured io_method, set by assign_io_method */
      93             : const IoMethodOps *pgaio_method_ops;
      94             : 
      95             : 
      96             : /* --------------------------------------------------------------------------------
      97             :  * Public Functions related to PgAioHandle
      98             :  * --------------------------------------------------------------------------------
      99             :  */
     100             : 
     101             : /*
     102             :  * Acquire an AioHandle, waiting for IO completion if necessary.
     103             :  *
     104             :  * Each backend can only have one AIO handle that has been "handed out" to
     105             :  * code, but not yet submitted or released. This restriction is necessary to
     106             :  * ensure that it is possible for code to wait for an unused handle by waiting
     107             :  * for in-flight IO to complete. There is a limited number of handles in each
     108             :  * backend; if multiple handles could be handed out without being submitted,
     109             :  * waiting for all in-flight IO to complete would not guarantee that handles
     110             :  * free up.
     111             :  *
     112             :  * It is cheap to acquire an IO handle, unless all handles are in use. In that
     113             :  * case this function waits for the oldest IO to complete. If that is not
     114             :  * desirable, use pgaio_io_acquire_nb().
     115             :  *
     116             :  * If a handle was acquired but then does not turn out to be needed,
     117             :  * e.g. because pgaio_io_acquire() is called before starting an IO in a
     118             :  * critical section, the handle needs to be released with pgaio_io_release().
     119             :  *
     120             :  *
     121             :  * To react to the completion of the IO as soon as it is known to have
     122             :  * completed, callbacks can be registered with pgaio_io_register_callbacks().
     123             :  *
     124             :  * To actually execute IO using the returned handle, the pgaio_io_start_*()
     125             :  * family of functions is used. In many cases the pgaio_io_start_*() call will
     126             :  * not be done directly by code that acquired the handle, but by lower level
     127             :  * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
     128             :  * AIO, it typically will pass the handle to smgr.c, which will pass it on to
     129             :  * md.c, on to fd.c, which then finally calls pgaio_io_start_*().  This
     130             :  * forwarding allows the various layers to react to the IO's completion by
     131             :  * registering callbacks. These callbacks in turn can translate a lower
     132             :  * layer's result into a result understandable by a higher layer.
     133             :  *
     134             :  * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
     135             :  * not submitted to the kernel). Unless in batchmode
     136             :  * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
     137             :  * execution. Note that, whether in batchmode or not, the IO might even
     138             :  * complete before the functions return.
     139             :  *
     140             :  * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
     141             :  * referenced by the IO issuing code. To e.g. wait for IO, references to the
     142             :  * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
     143             :  * is called.  pgaio_wref_wait() can be used to wait for the IO to complete.
     144             :  *
     145             :  *
     146             :  * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
     147             :  * passed to pgaio_io_acquire(). Once the issuing backend has called
     148             :  * pgaio_wref_wait(), the PgAioReturn contains information about whether the
     149             :  * operation succeeded and details about the first failure, if any. The error
     150             :  * can be raised / logged with pgaio_result_report().
     151             :  *
     152             :  * The lifetime of the memory pointed to by *ret needs to be at least as long
     153             :  * as the passed in resowner. If the resowner releases resources before the IO
     154             :  * completes (typically due to an error), the reference to *ret will be
     155             :  * cleared. In case of resowner cleanup *ret will not be updated with the
     156             :  * results of the IO operation.
     157             :  */
     158             : PgAioHandle *
     159        5596 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     160             : {
     161             :     PgAioHandle *h;
     162             : 
     163             :     while (true)
     164             :     {
     165       10992 :         h = pgaio_io_acquire_nb(resowner, ret);
     166             : 
     167       10988 :         if (h != NULL)
     168        5592 :             return h;
     169             : 
     170             :         /*
     171             :          * Evidently all handles by this backend are in use. Just wait for
     172             :          * some to complete.
     173             :          */
     174        5396 :         pgaio_io_wait_for_free();
     175             :     }
     176             : }
     177             : 
     178             : /*
     179             :  * Acquire an AioHandle, returning NULL if no handles are free.
     180             :  *
     181             :  * See pgaio_io_acquire(). The only difference is that this function will return
     182             :  * NULL if there are no idle handles, instead of blocking.
     183             :  */
     184             : PgAioHandle *
     185     2440118 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     186             : {
     187     2440118 :     PgAioHandle *ioh = NULL;
     188             : 
     189     2440118 :     if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
     190             :     {
     191             :         Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
     192           0 :         pgaio_submit_staged();
     193             :     }
     194             : 
     195     2440118 :     if (pgaio_my_backend->handed_out_io)
     196           4 :         elog(ERROR, "API violation: Only one IO can be handed out");
     197             : 
     198             :     /*
     199             :      * Probably not needed today, as interrupts should not process this IO,
     200             :      * but...
     201             :      */
     202     2440114 :     HOLD_INTERRUPTS();
     203             : 
     204     2440114 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     205             :     {
     206     2429322 :         dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
     207             : 
     208     2429322 :         ioh = dclist_container(PgAioHandle, node, ion);
     209             : 
     210             :         Assert(ioh->state == PGAIO_HS_IDLE);
     211             :         Assert(ioh->owner_procno == MyProcNumber);
     212             : 
     213     2429322 :         pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
     214     2429322 :         pgaio_my_backend->handed_out_io = ioh;
     215             : 
     216     2429322 :         if (resowner)
     217     2429322 :             pgaio_io_resowner_register(ioh);
     218             : 
     219     2429322 :         if (ret)
     220             :         {
     221     2429270 :             ioh->report_return = ret;
     222     2429270 :             ret->result.status = PGAIO_RS_UNKNOWN;
     223             :         }
     224             :     }
     225             : 
     226     2440114 :     RESUME_INTERRUPTS();
     227             : 
     228     2440114 :     return ioh;
     229             : }
     230             : 
     231             : /*
     232             :  * Release IO handle that turned out to not be required.
     233             :  *
     234             :  * See pgaio_io_acquire() for more details.
     235             :  */
     236             : void
     237       15150 : pgaio_io_release(PgAioHandle *ioh)
     238             : {
     239       15150 :     if (ioh == pgaio_my_backend->handed_out_io)
     240             :     {
     241             :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     242             :         Assert(ioh->resowner);
     243             : 
     244       15146 :         pgaio_my_backend->handed_out_io = NULL;
     245             : 
     246             :         /*
     247             :          * Note that no interrupts are processed between the handed_out_io
     248             :          * check and the call to reclaim - that's important as otherwise an
     249             :          * interrupt could have already reclaimed the handle.
     250             :          */
     251       15146 :         pgaio_io_reclaim(ioh);
     252             :     }
     253             :     else
     254             :     {
     255           4 :         elog(ERROR, "release in unexpected state");
     256             :     }
     257       15146 : }
     258             : 
     259             : /*
     260             :  * Release IO handle during resource owner cleanup.
     261             :  */
     262             : void
     263          94 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
     264             : {
     265          94 :     PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
     266             : 
     267             :     Assert(ioh->resowner);
     268             : 
     269             :     /*
     270             :      * Otherwise an interrupt, in the middle of releasing the IO, could end up
     271             :      * trying to wait for the IO, leading to state confusion.
     272             :      */
     273          94 :     HOLD_INTERRUPTS();
     274             : 
     275          94 :     ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     276          94 :     ioh->resowner = NULL;
     277             : 
     278          94 :     switch (ioh->state)
     279             :     {
     280           0 :         case PGAIO_HS_IDLE:
     281           0 :             elog(ERROR, "unexpected");
     282             :             break;
     283          66 :         case PGAIO_HS_HANDED_OUT:
     284             :             Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
     285             : 
     286          66 :             if (ioh == pgaio_my_backend->handed_out_io)
     287             :             {
     288          66 :                 pgaio_my_backend->handed_out_io = NULL;
     289          66 :                 if (!on_error)
     290          20 :                     elog(WARNING, "leaked AIO handle");
     291             :             }
     292             : 
     293          66 :             pgaio_io_reclaim(ioh);
     294          66 :             break;
     295           0 :         case PGAIO_HS_DEFINED:
     296             :         case PGAIO_HS_STAGED:
     297           0 :             if (!on_error)
     298           0 :                 elog(WARNING, "AIO handle was not submitted");
     299           0 :             pgaio_submit_staged();
     300           0 :             break;
     301          28 :         case PGAIO_HS_SUBMITTED:
     302             :         case PGAIO_HS_COMPLETED_IO:
     303             :         case PGAIO_HS_COMPLETED_SHARED:
     304             :         case PGAIO_HS_COMPLETED_LOCAL:
     305             :             /* this is expected to happen */
     306          28 :             break;
     307             :     }
     308             : 
     309             :     /*
     310             :      * Need to unregister the reporting of the IO's result, the memory it's
     311             :      * referencing likely has gone away.
     312             :      */
     313          94 :     if (ioh->report_return)
     314          28 :         ioh->report_return = NULL;
     315             : 
     316          94 :     RESUME_INTERRUPTS();
     317          94 : }
     318             : 
     319             : /*
     320             :  * Add a [set of] flags to the IO.
     321             :  *
     322             :  * Note that this combines flags with already set flags, rather than setting
     323             :  * the flags to exactly the passed in value. This is to allow multiple
     324             :  * callsites to set flags.
     325             :  */
     326             : void
     327     4825356 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
     328             : {
     329             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     330             : 
     331     4825356 :     ioh->flags |= flag;
     332     4825356 : }
     333             : 
     334             : /*
     335             :  * Returns an ID uniquely identifying the IO handle. This is only really
     336             :  * useful for logging, as handles are reused across multiple IOs.
     337             :  */
     338             : int
     339     1131294 : pgaio_io_get_id(PgAioHandle *ioh)
     340             : {
     341             :     Assert(ioh >= pgaio_ctl->io_handles &&
     342             :            ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
     343     1131294 :     return ioh - pgaio_ctl->io_handles;
     344             : }
     345             : 
     346             : /*
     347             :  * Return the ProcNumber for the process that can use an IO handle. The
     348             :  * mapping from IO handles to PGPROCs is static, therefore this even works
     349             :  * when the corresponding PGPROC is not in use.
     350             :  */
     351             : ProcNumber
     352           0 : pgaio_io_get_owner(PgAioHandle *ioh)
     353             : {
     354           0 :     return ioh->owner_procno;
     355             : }
     356             : 
     357             : /*
     358             :  * Return a wait reference for the IO. Only wait references can be used to
     359             :  * wait for an IO's completion, as handles themselves can be reused after
     360             :  * completion.  See also the comment above pgaio_io_acquire().
     361             :  */
     362             : void
     363     4828250 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
     364             : {
     365             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
     366             :            ioh->state == PGAIO_HS_DEFINED ||
     367             :            ioh->state == PGAIO_HS_STAGED);
     368             :     Assert(ioh->generation != 0);
     369             : 
     370     4828250 :     iow->aio_index = ioh - pgaio_ctl->io_handles;
     371     4828250 :     iow->generation_upper = (uint32) (ioh->generation >> 32);
     372     4828250 :     iow->generation_lower = (uint32) ioh->generation;
     373     4828250 : }
     374             : 
     375             : 
     376             : 
     377             : /* --------------------------------------------------------------------------------
     378             :  * Internal Functions related to PgAioHandle
     379             :  * --------------------------------------------------------------------------------
     380             :  */
     381             : 
     382             : static inline void
     383    18926064 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
     384             : {
     385             :     /*
     386             :      * All callers need to have held interrupts in some form, otherwise
     387             :      * interrupt processing could wait for the IO to complete, while in an
     388             :      * intermediary state.
     389             :      */
     390             :     Assert(!INTERRUPTS_CAN_BE_PROCESSED());
     391             : 
     392    18926064 :     pgaio_debug_io(DEBUG5, ioh,
     393             :                    "updating state to %s",
     394             :                    pgaio_io_state_get_name(new_state));
     395             : 
     396             :     /*
     397             :      * Ensure the changes signified by the new state are visible before the
     398             :      * new state becomes visible.
     399             :      */
     400    18926064 :     pg_write_barrier();
     401             : 
     402    18926064 :     ioh->state = new_state;
     403    18926064 : }
     404             : 
     405             : static void
     406     2429322 : pgaio_io_resowner_register(PgAioHandle *ioh)
     407             : {
     408             :     Assert(!ioh->resowner);
     409             :     Assert(CurrentResourceOwner);
     410             : 
     411     2429322 :     ResourceOwnerRememberAioHandle(CurrentResourceOwner, &ioh->resowner_node);
     412     2429322 :     ioh->resowner = CurrentResourceOwner;
     413     2429322 : }
     414             : 
     415             : /*
     416             :  * Stage IO for execution and, if appropriate, submit it immediately.
     417             :  *
     418             :  * Should only be called from pgaio_io_start_*().
     419             :  */
     420             : void
     421     2414110 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
     422             : {
     423             :     bool        needs_synchronous;
     424             : 
     425             :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     426             :     Assert(pgaio_my_backend->handed_out_io == ioh);
     427             :     Assert(pgaio_io_has_target(ioh));
     428             : 
     429             :     /*
     430             :      * Otherwise an interrupt, in the middle of staging and possibly executing
     431             :      * the IO, could end up trying to wait for the IO, leading to state
     432             :      * confusion.
     433             :      */
     434     2414110 :     HOLD_INTERRUPTS();
     435             : 
     436     2414110 :     ioh->op = op;
     437     2414110 :     ioh->result = 0;
     438             : 
     439     2414110 :     pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
     440             : 
     441             :     /* allow a new IO to be staged */
     442     2414110 :     pgaio_my_backend->handed_out_io = NULL;
     443             : 
     444     2414110 :     pgaio_io_call_stage(ioh);
     445             : 
     446     2414110 :     pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
     447             : 
     448             :     /*
     449             :      * Synchronous execution has to be executed, well, synchronously, so check
     450             :      * that first.
     451             :      */
     452     2414110 :     needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
     453             : 
     454     2414110 :     pgaio_debug_io(DEBUG3, ioh,
     455             :                    "staged (synchronous: %d, in_batch: %d)",
     456             :                    needs_synchronous, pgaio_my_backend->in_batchmode);
     457             : 
     458     2414110 :     if (!needs_synchronous)
     459             :     {
     460     1073476 :         pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
     461             :         Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
     462             : 
     463             :         /*
     464             :          * Unless code explicitly opted into batching IOs, submit the IO
     465             :          * immediately.
     466             :          */
     467     1073476 :         if (!pgaio_my_backend->in_batchmode)
     468       52052 :             pgaio_submit_staged();
     469             :     }
     470             :     else
     471             :     {
     472     1340634 :         pgaio_io_prepare_submit(ioh);
     473     1340634 :         pgaio_io_perform_synchronously(ioh);
     474             :     }
     475             : 
     476     2414110 :     RESUME_INTERRUPTS();
     477     2414110 : }
     478             : 
     479             : bool
     480     2414110 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
     481             : {
     482             :     /*
     483             :      * If the caller said to execute the IO synchronously, do so.
     484             :      *
     485             :      * XXX: We could optimize the logic when to execute synchronously by first
     486             :      * checking if there are other IOs in flight and only synchronously
     487             :      * executing if not. Unclear whether that'll be sufficiently common to be
     488             :      * worth worrying about.
     489             :      */
     490     2414110 :     if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
     491     1332510 :         return true;
     492             : 
     493             :     /* Check if the IO method requires synchronous execution of IO */
     494     1081600 :     if (pgaio_method_ops->needs_synchronous_execution)
     495     1081600 :         return pgaio_method_ops->needs_synchronous_execution(ioh);
     496             : 
     497           0 :     return false;
     498             : }
     499             : 
     500             : /*
     501             :  * Handle IO being processed by IO method.
     502             :  *
     503             :  * Should be called by IO methods / synchronous IO execution, just before the
     504             :  * IO is performed.
     505             :  */
     506             : void
     507     2414110 : pgaio_io_prepare_submit(PgAioHandle *ioh)
     508             : {
     509     2414110 :     pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
     510             : 
     511     2414110 :     dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
     512     2414110 : }
     513             : 
     514             : /*
     515             :  * Handle IO getting completed by a method.
     516             :  *
     517             :  * Should be called by IO methods / synchronous IO execution, just after the
     518             :  * IO has been performed.
     519             :  *
     520             :  * Expects to be called in a critical section. We expect IOs to be usable for
     521             :  * WAL etc, which requires being able to execute completion callbacks in a
     522             :  * critical section.
     523             :  */
     524             : void
     525     2205490 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
     526             : {
     527             :     Assert(ioh->state == PGAIO_HS_SUBMITTED);
     528             : 
     529             :     Assert(CritSectionCount > 0);
     530             : 
     531     2205490 :     ioh->result = result;
     532             : 
     533     2205490 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
     534             : 
     535     2205490 :     INJECTION_POINT("aio-process-completion-before-shared", ioh);
     536             : 
     537     2205490 :     pgaio_io_call_complete_shared(ioh);
     538             : 
     539     2205490 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
     540             : 
     541             :     /* condition variable broadcast ensures state is visible before wakeup */
     542     2205490 :     ConditionVariableBroadcast(&ioh->cv);
     543             : 
     544             :     /* contains call to pgaio_io_call_complete_local() */
     545     2205490 :     if (ioh->owner_procno == MyProcNumber)
     546     1340634 :         pgaio_io_reclaim(ioh);
     547     2205490 : }
     548             : 
     549             : /*
     550             :  * Has the IO completed and thus the IO handle been reused?
     551             :  *
     552             :  * This is useful when waiting for IO completion at a low level (e.g. in an IO
     553             :  * method's ->wait_one() callback).
     554             :  */
     555             : bool
     556     3000480 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
     557             : {
     558     3000480 :     *state = ioh->state;
     559             : 
     560             :     /*
     561             :      * Ensure that we don't see an earlier state of the handle than ioh->state
     562             :      * due to compiler or CPU reordering. This protects both ->generation as
     563             :      * directly used here, and other fields in the handle accessed in the
     564             :      * caller if the handle was not reused.
     565             :      */
     566     3000480 :     pg_read_barrier();
     567             : 
     568     3000480 :     return ioh->generation != ref_generation;
     569             : }
     570             : 
     571             : /*
     572             :  * Wait for IO to complete. External code should never use this; outside of
     573             :  * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
     574             :  */
     575             : static void
     576      387126 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
     577             : {
     578             :     PgAioHandleState state;
     579             :     bool        am_owner;
     580             : 
     581      387126 :     am_owner = ioh->owner_procno == MyProcNumber;
     582             : 
     583      387126 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     584          68 :         return;
     585             : 
     586      387058 :     if (am_owner)
     587             :     {
     588      375208 :         if (state != PGAIO_HS_SUBMITTED
     589       87172 :             && state != PGAIO_HS_COMPLETED_IO
     590         454 :             && state != PGAIO_HS_COMPLETED_SHARED
     591           0 :             && state != PGAIO_HS_COMPLETED_LOCAL)
     592             :         {
     593           0 :             elog(PANIC, "waiting for own IO %d in wrong state: %s",
     594             :                  pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
     595             :         }
     596             :     }
     597             : 
     598             :     while (true)
     599             :     {
     600      773478 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     601        8592 :             return;
     602             : 
     603      764886 :         switch (state)
     604             :         {
     605           0 :             case PGAIO_HS_IDLE:
     606             :             case PGAIO_HS_HANDED_OUT:
     607           0 :                 elog(ERROR, "IO in wrong state: %d", state);
     608             :                 break;
     609             : 
     610      294450 :             case PGAIO_HS_SUBMITTED:
     611             : 
     612             :                 /*
     613             :                  * If we need to wait via the IO method, do so now. Don't
     614             :                  * check via the IO method if the issuing backend is executing
     615             :                  * the IO synchronously.
     616             :                  */
     617      294450 :                 if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
     618             :                 {
     619           0 :                     pgaio_method_ops->wait_one(ioh, ref_generation);
     620           0 :                     continue;
     621             :                 }
     622             :                 /* fallthrough */
     623             : 
     624             :                 /* waiting for owner to submit */
     625             :             case PGAIO_HS_DEFINED:
     626             :             case PGAIO_HS_STAGED:
     627             :                 /* waiting for reaper to complete */
     628             :                 /* fallthrough */
     629             :             case PGAIO_HS_COMPLETED_IO:
     630             :                 /* shouldn't be able to hit this otherwise */
     631             :                 Assert(IsUnderPostmaster);
     632             :                 /* ensure we're going to get woken up */
     633      386420 :                 ConditionVariablePrepareToSleep(&ioh->cv);
     634             : 
     635      771848 :                 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
     636             :                 {
     637      763298 :                     if (state == PGAIO_HS_COMPLETED_SHARED ||
     638      385494 :                         state == PGAIO_HS_COMPLETED_LOCAL)
     639             :                         break;
     640      385428 :                     ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
     641             :                 }
     642             : 
     643      386420 :                 ConditionVariableCancelSleep();
     644      386420 :                 break;
     645             : 
     646      378466 :             case PGAIO_HS_COMPLETED_SHARED:
     647             :             case PGAIO_HS_COMPLETED_LOCAL:
     648             : 
     649             :                 /*
     650             :                  * Note that no interrupts are processed between
     651             :                  * pgaio_io_was_recycled() and this check - that's important
     652             :                  * as otherwise an interrupt could have already reclaimed the
     653             :                  * handle.
     654             :                  */
     655      378466 :                 if (am_owner)
     656      375208 :                     pgaio_io_reclaim(ioh);
     657      378466 :                 return;
     658             :         }
     659             :     }
     660             : }
     661             : 
     662             : /*
     663             :  * Make IO handle ready to be reused after IO has completed or after the
     664             :  * handle has been released without being used.
     665             :  *
     666             :  * Note that callers need to be careful about only calling this in the right
     667             :  * state and that no interrupts can be processed between the state check and
     668             :  * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
     669             :  * already have reclaimed the handle.
                     :  *
                     :  * In order, this: runs the local completion callbacks if the handle is in
                     :  * COMPLETED_SHARED state (reporting the result via ->report_return if set),
                     :  * removes the handle from the in-flight list unless it was merely handed
                     :  * out, detaches it from its resource owner, increments the generation,
                     :  * sets the state to IDLE, resets the per-IO fields and finally pushes the
                     :  * handle onto this backend's idle list.
     670             :  */
     671             : static void
     672     2429322 : pgaio_io_reclaim(PgAioHandle *ioh)
     673             : {
     674             :     /* This is only ok if it's our IO */
     675             :     Assert(ioh->owner_procno == MyProcNumber);
     676             :     Assert(ioh->state != PGAIO_HS_IDLE);
     677             : 
     678             :     /* see comment in function header */
     679     2429322 :     HOLD_INTERRUPTS();
     680             : 
     681             :     /*
     682             :      * It's a bit ugly, but right now the easiest place to put the execution
     683             :      * of local completion callbacks is this function, as we need to execute
     684             :      * local callbacks just before reclaiming at multiple callsites.
     685             :      */
     686     2429322 :     if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     687             :     {
     688             :         PgAioResult local_result;
     689             : 
     690     2414110 :         local_result = pgaio_io_call_complete_local(ioh);
     691     2414110 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
     692             : 
     693     2414110 :         if (ioh->report_return)
     694             :         {
     695     2414082 :             ioh->report_return->result = local_result;
     696     2414082 :             ioh->report_return->target_data = ioh->target_data;
     697             :         }
     698             :     }
     699             : 
     700     2429322 :     pgaio_debug_io(DEBUG4, ioh,
     701             :                    "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
     702             :                    pgaio_result_status_string(ioh->distilled_result.status),
     703             :                    ioh->distilled_result.id,
     704             :                    ioh->distilled_result.error_data,
     705             :                    ioh->result);
     706             : 
     707             :     /* if the IO has been defined, it's on the in-flight list, remove */
     708     2429322 :     if (ioh->state != PGAIO_HS_HANDED_OUT)
     709     2414110 :         dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
     710             : 
     711     2429322 :     if (ioh->resowner)
     712             :     {
     713     2429228 :         ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     714     2429228 :         ioh->resowner = NULL;
     715             :     }
     716             : 
     717             :     Assert(!ioh->resowner);
     718             : 
     719             :     /*
     720             :      * Update generation & state first, before resetting the IO's fields,
     721             :      * otherwise a concurrent "viewer" could think the fields are valid, even
     722             :      * though they are being reset.  Increment the generation first, so that
     723             :      * we can assert elsewhere that we never wait for an IDLE IO.  While it's
     724             :      * a bit weird for the state to go backwards for a generation, it's OK
     725             :      * here, as there cannot be references to the "reborn" IO yet.  Can't
     726             :      * update both at once, so something has to give.
     727             :      */
     728     2429322 :     ioh->generation++;
     729     2429322 :     pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
     730             : 
     731             :     /* ensure the state update is visible before we reset fields */
     732     2429322 :     pg_write_barrier();
     733             : 
     734     2429322 :     ioh->op = PGAIO_OP_INVALID;
     735     2429322 :     ioh->target = PGAIO_TID_INVALID;
     736     2429322 :     ioh->flags = 0;
     737     2429322 :     ioh->num_callbacks = 0;
     738     2429322 :     ioh->handle_data_len = 0;
     739     2429322 :     ioh->report_return = NULL;
     740     2429322 :     ioh->result = 0;
     741     2429322 :     ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
     742             : 
     743             :     /*
     744             :      * We push the IO to the head of the idle IO list, that seems more cache
     745             :      * efficient in cases where only a few IOs are used.
     746             :      */
     747     2429322 :     dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
     748             : 
     749     2429322 :     RESUME_INTERRUPTS();    /* pairs with HOLD_INTERRUPTS() at the top */
     750     2429322 : }
     751             : 
     752             : /*
     753             :  * Wait for an IO handle to become usable.
     754             :  *
     755             :  * This only really is useful for pgaio_io_acquire().
                     :  *
                     :  * Overview: first reclaim every handle of this backend that is already in
                     :  * COMPLETED_SHARED state and return if that freed at least one. Otherwise
                     :  * submit any staged IOs, return if an idle handle showed up meanwhile, and
                     :  * finally wait for (or directly reclaim) the IO at the head of the
                     :  * in-flight list.
                     :  *
                     :  * Raises an ERROR if there is neither an idle nor an in-flight IO, and
                     :  * PANICs if no idle IO exists after the final wait.
     756             :  */
     757             : static void
     758        5396 : pgaio_io_wait_for_free(void)
     759             : {
     760        5396 :     int         reclaimed = 0;
     761             : 
     762        5396 :     pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
     763             :                 pgaio_my_backend->num_staged_ios,
     764             :                 dclist_count(&pgaio_my_backend->in_flight_ios),
     765             :                 dclist_count(&pgaio_my_backend->idle_ios));
     766             : 
     767             :     /*
     768             :      * First check if any of our IOs actually have completed - when using
     769             :      * worker, that'll often be the case. We could do so as part of the loop
     770             :      * below, but that'd potentially lead us to wait for some IO submitted
     771             :      * before.
     772             :      */
     773       10792 :     for (int i = 0; i < io_max_concurrency; i++)
     774             :     {
     775        5396 :         PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
     776             : 
     777        5396 :         if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     778             :         {
     779             :             /*
     780             :              * Note that no interrupts are processed between the state check
     781             :              * and the call to reclaim - that's important as otherwise an
     782             :              * interrupt could have already reclaimed the handle.
     783             :              *
     784             :              * Need to ensure that there's no reordering here. In the more
     785             :              * common paths, where we wait for IO, that's done by
     786             :              * pgaio_io_was_recycled().
     787             :              */
     788        4474 :             pg_read_barrier();
     789        4474 :             pgaio_io_reclaim(ioh);
     790        4474 :             reclaimed++;
     791             :         }
     792             :     }
     793             : 
     794        5396 :     if (reclaimed > 0)
     795        4474 :         return;
     796             : 
     797             :     /*
     798             :      * If we have any unsubmitted IOs, submit them now. We'll start waiting in
     799             :      * a second, so it's better they're in flight. This also addresses the
     800             :      * edge-case that all IOs are unsubmitted.
     801             :      */
     802         922 :     if (pgaio_my_backend->num_staged_ios > 0)
     803           0 :         pgaio_submit_staged();
     804             : 
     805             :     /* possibly some IOs finished during submission */
     806         922 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     807           0 :         return;
     808             : 
     809         922 :     if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
     810           0 :         ereport(ERROR,
     811             :                 errmsg_internal("no free IOs despite no in-flight IOs"),
     812             :                 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
     813             :                                    pgaio_my_backend->num_staged_ios,
     814             :                                    dclist_count(&pgaio_my_backend->in_flight_ios),
     815             :                                    dclist_count(&pgaio_my_backend->idle_ios)));
     816             : 
     817             :     /*
     818             :      * Wait for the oldest in-flight IO to complete.
     819             :      *
     820             :      * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
     821             :      * for that specific IO to complete, we just need *any* IO to complete.
     822             :      */
     823             :     {
     824         922 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
     825             :                                                &pgaio_my_backend->in_flight_ios);
     826         922 :         uint64      generation = ioh->generation;   /* read before anything below can process interrupts */
     827             : 
     828         922 :         switch (ioh->state)
     829             :         {
     830             :                 /* should not be in in-flight list */
     831           0 :             case PGAIO_HS_IDLE:
     832             :             case PGAIO_HS_DEFINED:
     833             :             case PGAIO_HS_HANDED_OUT:
     834             :             case PGAIO_HS_STAGED:
     835             :             case PGAIO_HS_COMPLETED_LOCAL:
     836           0 :                 elog(ERROR, "shouldn't get here with io:%d in state %d",
     837             :                      pgaio_io_get_id(ioh), ioh->state);
     838             :                 break;
     839             : 
     840         920 :             case PGAIO_HS_COMPLETED_IO:
     841             :             case PGAIO_HS_SUBMITTED:
     842         920 :                 pgaio_debug_io(DEBUG2, ioh,
     843             :                                "waiting for free io with %u in flight",
     844             :                                dclist_count(&pgaio_my_backend->in_flight_ios));
     845             : 
     846             :                 /*
     847             :                  * In a more general case this would be racy, because the
     848             :                  * generation could increase after we read ioh->state above.
     849             :                  * But we are only looking at IOs by the current backend and
     850             :                  * the IO can only be recycled by this backend.  Even this is
     851             :                  * only OK because we get the handle's generation before
     852             :                  * potentially processing interrupts, e.g. as part of
     853             :                  * pgaio_debug_io().
     854             :                  */
     855         920 :                 pgaio_io_wait(ioh, generation);
     856         920 :                 break;
     857             : 
     858           2 :             case PGAIO_HS_COMPLETED_SHARED:
     859             : 
     860             :                 /*
     861             :                  * It's possible that another backend just finished this IO.
     862             :                  *
     863             :                  * Note that no interrupts are processed between the state
     864             :                  * check and the call to reclaim - that's important as
     865             :                  * otherwise an interrupt could have already reclaimed the
     866             :                  * handle.
     867             :                  *
     868             :                  * Need to ensure that there's no reordering here. In the
     869             :                  * more common paths, where we wait for IO, that's done by
     870             :                  * pgaio_io_was_recycled().
     871             :                  */
     872           2 :                 pg_read_barrier();
     873           2 :                 pgaio_io_reclaim(ioh);
     874           2 :                 break;
     875             :         }
     876             : 
     877         922 :         if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
     878           0 :             elog(PANIC, "no idle IO after waiting for IO to terminate");
     879         922 :         return;
     880             :     }
     881             : }
     882             : 
     883             : /*
     884             :  * Internal - code outside of AIO should never need this and it'd be hard for
     885             :  * such code to be safe.
                     :  *
                     :  * Returns the handle at index iow->aio_index and stores the generation the
                     :  * reference was created for in *ref_generation. That generation is
                     :  * reassembled from the reference's generation_upper and generation_lower
                     :  * halves and is asserted to be non-zero.
     886             :  */
     887             : static PgAioHandle *
     888     1454216 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
     889             : {
     890             :     PgAioHandle *ioh;
     891             : 
     892             :     Assert(iow->aio_index < pgaio_ctl->io_handle_count);
     893             : 
     894     1454216 :     ioh = &pgaio_ctl->io_handles[iow->aio_index];
     895             : 
     896     1454216 :     *ref_generation = ((uint64) iow->generation_upper) << 32 |
     897     1454216 :         iow->generation_lower;
     898             : 
     899             :     Assert(*ref_generation != 0);
     900             : 
     901     1454216 :     return ioh;
     902             : }
     903             : 
     904             : static const char *
     905       14382 : pgaio_io_state_get_name(PgAioHandleState s)    /* map a handle state to a string literal naming it */
     906             : {
     907             : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
     908       14382 :     switch (s)              /* each case returns the state's symbol without the PGAIO_HS_ prefix */
     909             :     {
     910           0 :             PGAIO_HS_TOSTR_CASE(IDLE);
     911        4764 :             PGAIO_HS_TOSTR_CASE(HANDED_OUT);
     912        2382 :             PGAIO_HS_TOSTR_CASE(DEFINED);
     913        2382 :             PGAIO_HS_TOSTR_CASE(STAGED);
     914          70 :             PGAIO_HS_TOSTR_CASE(SUBMITTED);
     915        2384 :             PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
     916        2400 :             PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
     917           0 :             PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
     918             :     }
     919             : #undef PGAIO_HS_TOSTR_CASE
     920             : 
     921           0 :     return NULL;                /* silence compiler; only reached if no case above matched */
     922             : }
     923             : 
     924             : const char *
     925       14382 : pgaio_io_get_state_name(PgAioHandle *ioh)
     926             : {
     927       14382 :     return pgaio_io_state_get_name(ioh->state); /* name of the handle's current state */
     928             : }
     929             : 
     930             : const char *
     931        4764 : pgaio_result_status_string(PgAioResultStatus rs)   /* map a result status to a string literal naming it */
     932             : {
     933        4764 :     switch (rs)             /* the returned strings drop the PGAIO_RS_ prefix */
     934             :     {
     935           0 :         case PGAIO_RS_UNKNOWN:
     936           0 :             return "UNKNOWN";
     937        4404 :         case PGAIO_RS_OK:
     938        4404 :             return "OK";
     939         136 :         case PGAIO_RS_WARNING:
     940         136 :             return "WARNING";
     941          40 :         case PGAIO_RS_PARTIAL:
     942          40 :             return "PARTIAL";
     943         184 :         case PGAIO_RS_ERROR:
     944         184 :             return "ERROR";
     945             :     }
     946             : 
     947           0 :     return NULL;                /* silence compiler; only reached if no case above matched */
     948             : }
     949             : 
     950             : 
     951             : 
     952             : /* --------------------------------------------------------------------------------
     953             :  * Functions primarily related to IO Wait References
     954             :  * --------------------------------------------------------------------------------
     955             :  */
     956             : 
     957             : /*
     958             :  * Mark a wait reference as invalid, by setting its aio_index to
                     :  * PG_UINT32_MAX. Only aio_index is written; the generation fields are left
                     :  * untouched.
     959             :  */
     960             : void
     961    25966180 : pgaio_wref_clear(PgAioWaitRef *iow)
     962             : {
     963    25966180 :     iow->aio_index = PG_UINT32_MAX;
     964    25966180 : }
     965             : 
     966             : /* Is the wait reference valid, i.e. is its aio_index not PG_UINT32_MAX? */
     967             : bool
     968     4942866 : pgaio_wref_valid(PgAioWaitRef *iow)
     969             : {
     970     4942866 :     return iow->aio_index != PG_UINT32_MAX;
     971             : }
     972             : 
     973             : /*
     974             :  * Similar to pgaio_io_get_id(), just for wait references.
                     :  *
                     :  * Returns the reference's aio_index. The reference must be valid, which is
                     :  * only checked by an assertion.
     975             :  */
     976             : int
     977           0 : pgaio_wref_get_id(PgAioWaitRef *iow)
     978             : {
     979             :     Assert(pgaio_wref_valid(iow));
     980           0 :     return iow->aio_index;
     981             : }
     982             : 
     983             : /*
     984             :  * Wait for the IO to have completed. Can be called in any process, not just
     985             :  * in the issuing backend.
                     :  *
                     :  * Resolves the reference to its handle and generation and then waits via
                     :  * pgaio_io_wait().
     986             :  */
     987             : void
     988      386188 : pgaio_wref_wait(PgAioWaitRef *iow)
     989             : {
     990             :     uint64      ref_generation;
     991             :     PgAioHandle *ioh;
     992             : 
     993      386188 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
     994             : 
     995      386188 :     pgaio_io_wait(ioh, ref_generation);
     996      386188 : }
     997             : 
     998             : /*
     999             :  * Check if the referenced IO completed, without blocking.
                     :  *
                     :  * Returns true if the handle was recycled since the reference was taken,
                     :  * if it is idle, or if it is in COMPLETED_SHARED or COMPLETED_LOCAL state.
                     :  * In the last case the handle is additionally reclaimed when this backend
                     :  * owns it. Returns false in every other state.
    1000             :  */
    1001             : bool
    1002     1068028 : pgaio_wref_check_done(PgAioWaitRef *iow)
    1003             : {
    1004             :     uint64      ref_generation;
    1005             :     PgAioHandleState state;
    1006             :     bool        am_owner;
    1007             :     PgAioHandle *ioh;
    1008             : 
    1009     1068028 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
    1010             : 
    1011     1068028 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
    1012           0 :         return true;
    1013             : 
    1014     1068028 :     if (state == PGAIO_HS_IDLE)
    1015           0 :         return true;
    1016             : 
    1017     1068028 :     am_owner = ioh->owner_procno == MyProcNumber;
    1018             : 
    1019     1068028 :     if (state == PGAIO_HS_COMPLETED_SHARED ||
    1020      374236 :         state == PGAIO_HS_COMPLETED_LOCAL)
    1021             :     {
    1022             :         /*
    1023             :          * Note that no interrupts are processed between
    1024             :          * pgaio_io_was_recycled() and this check - that's important as
    1025             :          * otherwise an interrupt could have already reclaimed the handle.
    1026             :          */
    1027      693792 :         if (am_owner)
    1028      693792 :             pgaio_io_reclaim(ioh);
    1029      693792 :         return true;
    1030             :     }
    1031             : 
    1032             :     /*
    1033             :      * XXX: It likely would be worth checking in with the io method, to give
    1034             :      * the IO method a chance to check if there are completion events queued.
    1035             :      */
    1036             : 
    1037      374236 :     return false;
    1038             : }
    1039             : 
    1040             : 
    1041             : 
    1042             : /* --------------------------------------------------------------------------------
    1043             :  * Actions on multiple IOs.
    1044             :  * --------------------------------------------------------------------------------
    1045             :  */
    1046             : 
    1047             : /*
    1048             :  * Submit IOs in batches going forward.
    1049             :  *
    1050             :  * Submitting multiple IOs at once can be substantially faster than doing so
    1051             :  * one-by-one. At the same time, submitting multiple IOs at once requires more
    1052             :  * care to avoid deadlocks.
    1053             :  *
    1054             :  * Consider backend A staging an IO for buffer 1 and then trying to start IO
    1055             :  * on buffer 2, while backend B does the inverse. If A submitted the IO before
    1056             :  * moving on to buffer 2, this works just fine, B will wait for the IO to
    1057             :  * complete. But if batching were used, each backend will wait for IO that has
    1058             :  * not yet been submitted to complete, i.e. forever.
    1059             :  *
    1060             :  * End batch submission mode with pgaio_exit_batchmode().  (Throwing errors is
    1061             :  * allowed; error recovery will end the batch.)
    1062             :  *
    1063             :  * To avoid deadlocks, code needs to ensure that it will not wait for another
    1064             :  * backend while there is unsubmitted IO. E.g. by using conditional lock
    1065             :  * acquisition when acquiring buffer locks. To check if there currently are
    1066             :  * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
    1067             :  * pgaio_submit_staged().
    1068             :  *
    1069             :  * It is not allowed to enter batchmode while already in batchmode (doing so
                     :  * raises an ERROR). Nesting is
    1070             :  * unlikely to ever be needed, as code needs to be explicitly aware of being
    1071             :  * called in batchmode, to avoid the deadlock risks explained above.
    1072             :  *
    1073             :  * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
    1074             :  * e.g. because too many IOs have been staged or because pgaio_submit_staged()
    1075             :  * was called.
    1076             :  */
    1077             : void
    1078     5368792 : pgaio_enter_batchmode(void)
    1079             : {
    1080     5368792 :     if (pgaio_my_backend->in_batchmode)
    1081           0 :         elog(ERROR, "starting batch while batch already in progress");
    1082     5368792 :     pgaio_my_backend->in_batchmode = true;
    1083     5368792 : }
    1084             : 
    1085             : /*
    1086             :  * Stop submitting IOs in batches.
                     :  *
                     :  * Submits all IOs that are still staged and then clears the batchmode
                     :  * flag. Being in batchmode on entry is only checked by an assertion.
    1087             :  */
    1088             : void
    1089     5368772 : pgaio_exit_batchmode(void)
    1090             : {
    1091             :     Assert(pgaio_my_backend->in_batchmode);
    1092             : 
    1093     5368772 :     pgaio_submit_staged();
    1094     5368772 :     pgaio_my_backend->in_batchmode = false;
    1095     5368772 : }
    1096             : 
    1097             : /*
    1098             :  * Are there staged but unsubmitted IOs?
    1099             :  *
    1100             :  * See comment above pgaio_enter_batchmode() for why code may need to check if
    1101             :  * there is IO in that state.
                     :  *
                     :  * Returns true if this backend's count of staged IOs is greater than zero.
                     :  * Asserts that staged IOs can only exist while in batchmode.
    1102             :  */
    1103             : bool
    1104     2429126 : pgaio_have_staged(void)
    1105             : {
    1106             :     Assert(pgaio_my_backend->in_batchmode ||
    1107             :            pgaio_my_backend->num_staged_ios == 0);
    1108     2429126 :     return pgaio_my_backend->num_staged_ios > 0;
    1109             : }
    1110             : 
    1111             : /*
    1112             :  * Submit all staged but not yet submitted IOs.
    1113             :  *
    1114             :  * Unless in batch mode, this never needs to be called, as IOs get submitted
    1115             :  * as soon as possible. While in batchmode pgaio_submit_staged() can be called
    1116             :  * before waiting on another backend, to avoid the risk of deadlocks. See
    1117             :  * pgaio_enter_batchmode().
                     :  *
                     :  * Returns immediately if nothing is staged. Otherwise all staged IOs are
                     :  * passed to the IO method's submit callback inside a critical section and
                     :  * the staged count is reset to zero afterwards.
    1118             :  */
    1119             : void
    1120     5426244 : pgaio_submit_staged(void)
    1121             : {
    1122     5426244 :     int         total_submitted = 0;
    1123             :     int         did_submit;
    1124             : 
    1125     5426244 :     if (pgaio_my_backend->num_staged_ios == 0)
    1126     4353912 :         return;
    1127             : 
    1128             : 
    1129     1072332 :     START_CRIT_SECTION();
    1130             : 
    1131     1072332 :     did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
    1132     1072332 :                                           pgaio_my_backend->staged_ios);
    1133             : 
    1134     1072332 :     END_CRIT_SECTION();
    1135             : 
    1136     1072332 :     total_submitted += did_submit;
    1137             : 
    1138             :     Assert(total_submitted == did_submit);  /* holds trivially: submit is called once and the total starts at 0 */
    1139             : 
    1140     1072332 :     pgaio_my_backend->num_staged_ios = 0;
    1141             : 
    1142     1072332 :     pgaio_debug(DEBUG4,
    1143             :                 "aio: submitted %d IOs",
    1144             :                 total_submitted);
    1145             : }
    1146             : 
    1147             : 
    1148             : 
    1149             : /* --------------------------------------------------------------------------------
    1150             :  * Other
    1151             :  * --------------------------------------------------------------------------------
    1152             :  */
    1153             : 
    1154             : 
    1155             : /*
    1156             :  * Perform AIO related cleanup after an error.
    1157             :  *
    1158             :  * This should be called early in the error recovery paths, as later steps may
    1159             :  * need to issue AIO (e.g. to record a transaction abort WAL record).
                     :  *
                     :  * If the backend is still in batchmode, the batchmode flag is cleared first
                     :  * and any staged IOs are submitted afterwards.
    1160             :  */
    1161             : void
    1162       58560 : pgaio_error_cleanup(void)
    1163             : {
    1164             :     /*
    1165             :      * It is possible that code errored out after pgaio_enter_batchmode() but
    1166             :      * before pgaio_exit_batchmode() was called. In that case we need to
    1167             :      * submit the IO now.
    1168             :      */
    1169       58560 :     if (pgaio_my_backend->in_batchmode)
    1170             :     {
    1171          20 :         pgaio_my_backend->in_batchmode = false;
    1172             : 
    1173          20 :         pgaio_submit_staged();
    1174             :     }
    1175             : 
    1176             :     /*
    1177             :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1178             :      */
    1179             :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1180       58560 : }
    1181             : 
    1182             : /*
    1183             :  * Perform AIO related checks at (sub-)transactional boundaries.
    1184             :  *
    1185             :  * This should be called late during (sub-)transactional commit/abort, after
    1186             :  * all steps that might need to perform AIO, so that we can verify that the
    1187             :  * AIO subsystem is in a valid state at the end of a transaction.
                     :  *
                     :  * is_commit is currently not looked at; the same checks are performed for
                     :  * commit and abort.
    1188             :  */
    1189             : void
    1190     1123200 : AtEOXact_Aio(bool is_commit)
    1191             : {
    1192             :     /*
    1193             :      * We should never be in batch mode at transactional boundaries. In case
    1194             :      * an error was thrown while in batch mode, pgaio_error_cleanup() should
    1195             :      * have exited batchmode.
    1196             :      *
    1197             :      * In case we are in batchmode somehow, make sure to submit all staged
    1198             :      * IOs, other backends may need them to complete to continue.
    1199             :      */
    1200     1123200 :     if (pgaio_my_backend->in_batchmode)
    1201             :     {
    1202           8 :         pgaio_error_cleanup();  /* leaves batchmode and submits staged IOs */
    1203           8 :         elog(WARNING, "open AIO batch at end of (sub-)transaction");
    1204             :     }
    1205             : 
    1206             :     /*
    1207             :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1208             :      */
    1209             :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1210     1123200 : }
    1211             : 
    1212             : /*
    1213             :  * Need to submit staged but not yet submitted IOs using the fd, otherwise
    1214             :  * the IO would end up targeting something bogus.
    1215             :  */
    1216             : void
    1217    16528566 : pgaio_closing_fd(int fd)
    1218             : {
    1219             :     /*
    1220             :      * Might be called before AIO is initialized or in a subprocess that
    1221             :      * doesn't use AIO.
    1222             :      */
    1223    16528566 :     if (!pgaio_my_backend)
    1224       13860 :         return;
    1225             : 
    1226             :     /*
    1227             :      * For now just submit all staged IOs - we could be more selective, but
    1228             :      * it's probably not worth it.
    1229             :      */
    1230    16514706 :     if (pgaio_my_backend->num_staged_ios > 0)
    1231             :     {
    1232           4 :         pgaio_debug(DEBUG2,
    1233             :                     "submitting %d IOs before FD %d gets closed",
    1234             :                     pgaio_my_backend->num_staged_ios, fd);
    1235           4 :         pgaio_submit_staged();
    1236             :     }
    1237             : 
    1238             :     /*
    1239             :      * If requested by the IO method, wait for all IOs that use the
    1240             :      * to-be-closed FD.
    1241             :      */
    1242    16514706 :     if (pgaio_method_ops->wait_on_fd_before_close)
    1243             :     {
    1244             :         /*
    1245             :          * As waiting for one IO to complete may complete multiple IOs, we
    1246             :          * can't just use a mutable list iterator. The maximum number of
    1247             :          * in-flight IOs is fairly small, so just restart the loop after
    1248             :          * waiting for an IO.
    1249             :          */
    1250           0 :         while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1251             :         {
    1252             :             dlist_iter  iter;
    1253           0 :             PgAioHandle *ioh = NULL;
    1254             :             uint64      generation;
    1255             : 
    1256           0 :             dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
    1257             :             {
    1258           0 :                 ioh = dclist_container(PgAioHandle, node, iter.cur);
    1259             : 
    1260           0 :                 generation = ioh->generation;
    1261             : 
    1262           0 :                 if (pgaio_io_uses_fd(ioh, fd))
    1263           0 :                     break;
    1264             :                 else
    1265           0 :                     ioh = NULL;
    1266             :             }
    1267             : 
    1268           0 :             if (!ioh)
    1269           0 :                 break;
    1270             : 
    1271           0 :             pgaio_debug_io(DEBUG2, ioh,
    1272             :                            "waiting for IO before FD %d gets closed, %u in-flight IOs",
    1273             :                            fd, dclist_count(&pgaio_my_backend->in_flight_ios));
    1274             : 
    1275             :             /* see comment in pgaio_io_wait_for_free() about raciness */
    1276           0 :             pgaio_io_wait(ioh, generation);
    1277             :         }
    1278             :     }
    1279             : }
    1280             : 
    1281             : /*
    1282             :  * Registered as before_shmem_exit() callback in pgaio_init_backend()
    1283             :  */
    1284             : void
    1285       39002 : pgaio_shutdown(int code, Datum arg)
    1286             : {
    1287             :     Assert(pgaio_my_backend);
    1288             :     Assert(!pgaio_my_backend->handed_out_io);
    1289             : 
    1290             :     /* first clean up resources as we would at a transaction boundary */
    1291       39002 :     AtEOXact_Aio(code == 0);
    1292             : 
    1293             :     /*
    1294             :      * Before exiting, make sure that all IOs are finished. That has two main
    1295             :      * purposes:
    1296             :      *
    1297             :      * - Some kernel-level AIO mechanisms don't deal well with the issuer of
    1298             :      * an AIO exiting before IO completed
    1299             :      *
    1300             :      * - It'd be confusing to see partially finished IOs in stats views etc
    1301             :      */
    1302       39020 :     while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1303             :     {
    1304          18 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
    1305          18 :         uint64      generation = ioh->generation;
    1306             : 
    1307          18 :         pgaio_debug_io(DEBUG2, ioh,
    1308             :                        "waiting for IO to complete during shutdown, %u in-flight IOs",
    1309             :                        dclist_count(&pgaio_my_backend->in_flight_ios));
    1310             : 
    1311             :         /* see comment in pgaio_io_wait_for_free() about raciness */
    1312          18 :         pgaio_io_wait(ioh, generation);
    1313             :     }
    1314             : 
    1315       39002 :     pgaio_my_backend = NULL;
    1316       39002 : }
    1317             : 
    1318             : void
    1319        2216 : assign_io_method(int newval, void *extra)
    1320             : {
    1321             :     Assert(pgaio_method_ops_table[newval] != NULL);
    1322             :     Assert(newval < lengthof(io_method_options));
    1323             : 
    1324        2216 :     pgaio_method_ops = pgaio_method_ops_table[newval];
    1325        2216 : }
    1326             : 
    1327             : bool
    1328        4316 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
    1329             : {
    1330        4316 :     if (*newval == -1)
    1331             :     {
    1332             :         /*
    1333             :          * Auto-tuning will be applied later during startup, as auto-tuning
    1334             :          * depends on the value of various GUCs.
    1335             :          */
    1336        2194 :         return true;
    1337             :     }
    1338        2122 :     else if (*newval == 0)
    1339             :     {
    1340           0 :         GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
    1341           0 :         return false;
    1342             :     }
    1343             : 
    1344        2122 :     return true;
    1345             : }

Generated by: LCOV version 1.16