LCOV - code coverage report
Current view: top level - src/backend/storage/aio - aio.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 84.8 % 341 289
Test Date: 2026-04-07 14:16:30 Functions: 94.6 % 37 35
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * aio.c
       4              :  *    AIO - Core Logic
       5              :  *
       6              :  * For documentation about how AIO works on a higher level, including a
       7              :  * schematic example, see README.md.
       8              :  *
       9              :  *
      10              :  * AIO is a complicated subsystem. To keep things navigable, it is split
      11              :  * across a number of files:
      12              :  *
      13              :  * - method_*.c - different ways of executing AIO (e.g. worker process)
      14              :  *
      15              :  * - aio_target.c - IO on different kinds of targets
      16              :  *
      17              :  * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
      18              :  *
      19              :  * - aio_callback.c - callbacks at IO operation lifecycle events
      20              :  *
      21              :  * - aio_init.c - per-server and per-backend initialization
      22              :  *
      23              :  * - aio.c - all other topics
      24              :  *
      25              :  * - read_stream.c - helper for reading buffered relation data
      26              :  *
      27              :  * - README.md - higher-level overview of AIO
      28              :  *
      29              :  *
      30              :  * Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group
      31              :  * Portions Copyright (c) 1994, Regents of the University of California
      32              :  *
      33              :  * IDENTIFICATION
      34              :  *    src/backend/storage/aio/aio.c
      35              :  *
      36              :  *-------------------------------------------------------------------------
      37              :  */
      38              : 
      39              : #include "postgres.h"
      40              : 
      41              : #include "lib/ilist.h"
      42              : #include "miscadmin.h"
      43              : #include "port/atomics.h"
      44              : #include "storage/aio.h"
      45              : #include "storage/aio_internal.h"
      46              : #include "storage/aio_subsys.h"
      47              : #include "utils/guc.h"
      48              : #include "utils/guc_hooks.h"
      49              : #include "utils/injection_point.h"
      50              : #include "utils/resowner.h"
      51              : #include "utils/wait_event_types.h"
      52              : 
      53              : 
      54              : static inline void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state);
      55              : static void pgaio_io_reclaim(PgAioHandle *ioh);
      56              : static void pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner);
      57              : static void pgaio_io_wait_for_free(void);
      58              : static PgAioHandle *pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation);
      59              : static const char *pgaio_io_state_get_name(PgAioHandleState s);
      60              : static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation);
      61              : 
      62              : 
      63              : /* Options for io_method. */
      64              : const struct config_enum_entry io_method_options[] = {
      65              :     {"sync", IOMETHOD_SYNC, false},
      66              :     {"worker", IOMETHOD_WORKER, false},
      67              : #ifdef IOMETHOD_IO_URING_ENABLED
      68              :     {"io_uring", IOMETHOD_IO_URING, false},
      69              : #endif
      70              :     {NULL, 0, false}
      71              : };
      72              : 
      73              : /* GUCs */
      74              : int         io_method = DEFAULT_IO_METHOD;
      75              : int         io_max_concurrency = -1;
      76              : 
      77              : /* global control for AIO */
      78              : PgAioCtl   *pgaio_ctl;
      79              : 
      80              : /* current backend's per-backend state */
      81              : PgAioBackend *pgaio_my_backend;
      82              : 
      83              : 
      84              : static const IoMethodOps *const pgaio_method_ops_table[] = {
      85              :     [IOMETHOD_SYNC] = &pgaio_sync_ops,
      86              :     [IOMETHOD_WORKER] = &pgaio_worker_ops,
      87              : #ifdef IOMETHOD_IO_URING_ENABLED
      88              :     [IOMETHOD_IO_URING] = &pgaio_uring_ops,
      89              : #endif
      90              : };
      91              : 
      92              : StaticAssertDecl(lengthof(io_method_options) == lengthof(pgaio_method_ops_table) + 1,
      93              :                  "io_method_options out of sync with pgaio_method_ops_table");
      94              : 
      95              : /* callbacks for the configured io_method, set by assign_io_method */
      96              : const IoMethodOps *pgaio_method_ops;
      97              : 
      98              : 
      99              : /* --------------------------------------------------------------------------------
     100              :  * Public Functions related to PgAioHandle
     101              :  * --------------------------------------------------------------------------------
     102              :  */
     103              : 
     104              : /*
     105              :  * Acquire an AioHandle, waiting for IO completion if necessary.
     106              :  *
     107              :  * Each backend can only have one AIO handle that has been "handed out" to
     108              :  * code, but not yet submitted or released. This restriction is necessary to
     109              :  * ensure that it is possible for code to wait for an unused handle by waiting
     110              :  * for in-flight IO to complete. There is a limited number of handles in each
     111              :  * backend, if multiple handles could be handed out without being submitted,
     112              :  * backend; if multiple handles could be handed out without being submitted,
     113              :  * free up.
     114              :  *
     115              :  * It is cheap to acquire an IO handle, unless all handles are in use. In that
     116              :  * case this function waits for the oldest IO to complete. If that is not
     117              :  * desirable, use pgaio_io_acquire_nb().
     118              :  *
     119              :  * If a handle was acquired but then does not turn out to be needed,
     120              :  * e.g. because pgaio_io_acquire() is called before starting an IO in a
     121              :  * critical section, the handle needs to be released with pgaio_io_release().
     122              :  *
     123              :  *
     124              :  * To react to the completion of the IO as soon as it is known to have
     125              :  * completed, callbacks can be registered with pgaio_io_register_callbacks().
     126              :  *
     127              :  * To actually execute IO using the returned handle, the pgaio_io_start_*()
     128              :  * family of functions is used. In many cases the pgaio_io_start_*() call will
     129              :  * not be done directly by code that acquired the handle, but by lower level
     130              :  * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
     131              :  * AIO, it typically will pass the handle to smgr.c, which will pass it on to
     132              :  * md.c, on to fd.c, which then finally calls pgaio_io_start_*().  This
     133              :  * forwarding allows the various layers to react to the IO's completion by
     134              :  * registering callbacks. These callbacks in turn can translate a lower
     135              :  * layer's result into a result understandable by a higher layer.
     136              :  *
     137              :  * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
     138              :  * not submitted to the kernel). Unless in batchmode
     139              :  * (c.f. pgaio_enter_batchmode()), the IO will also get submitted for
     140              :  * execution. Note that, whether in batchmode or not, the IO might even
     141              :  * complete before the functions return.
     142              :  *
     143              :  * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
     144              :  * referenced by the IO issuing code. To e.g. wait for IO, references to the
     145              :  * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
     146              :  * is called.  pgaio_wref_wait() can be used to wait for the IO to complete.
     147              :  *
     148              :  *
     149              :  * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
     150              :  * passed to pgaio_io_acquire(). Once the issuing backend has called
     151              :  * pgaio_wref_wait(), the PgAioReturn contains information about whether the
     152              :  * operation succeeded and details about the first failure, if any. The error
     153              :  * can be raised / logged with pgaio_result_report().
     154              :  *
     155              :  * The lifetime of the memory pointed to by *ret needs to be at least as long
     156              :  * as the passed in resowner. If the resowner releases resources before the IO
     157              :  * completes (typically due to an error), the reference to *ret will be
     158              :  * cleared. In case of resowner cleanup *ret will not be updated with the
     159              :  * results of the IO operation.
     160              :  */
     161              : PgAioHandle *
     162         3272 : pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     163              : {
     164              :     PgAioHandle *h;
     165              : 
     166              :     while (true)
     167              :     {
     168         6422 :         h = pgaio_io_acquire_nb(resowner, ret);
     169              : 
     170         6420 :         if (h != NULL)
     171         3270 :             return h;
     172              : 
     173              :         /*
     174              :          * Evidently all handles of this backend are in use. Just wait for
     175              :          * some to complete.
     176              :          */
     177         3150 :         pgaio_io_wait_for_free();
     178              :     }
     179              : }
     180              : 
     181              : /*
     182              :  * Acquire an AioHandle, returning NULL if no handles are free.
     183              :  *
     184              :  * See pgaio_io_acquire(). The only difference is that this function will return
     185              :  * NULL if there are no idle handles, instead of blocking.
     186              :  */
     187              : PgAioHandle *
     188      1487817 : pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
     189              : {
     190      1487817 :     PgAioHandle *ioh = NULL;
     191              : 
     192      1487817 :     if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
     193              :     {
     194              :         Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
     195            0 :         pgaio_submit_staged();
     196              :     }
     197              : 
     198      1487817 :     if (pgaio_my_backend->handed_out_io)
     199            2 :         elog(ERROR, "API violation: Only one IO can be handed out");
     200              : 
     201              :     /*
     202              :      * Probably not needed today, as interrupts should not process this IO,
     203              :      * but...
     204              :      */
     205      1487815 :     HOLD_INTERRUPTS();
     206              : 
     207      1487815 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     208              :     {
     209      1481515 :         dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
     210              : 
     211      1481515 :         ioh = dclist_container(PgAioHandle, node, ion);
     212              : 
     213              :         Assert(ioh->state == PGAIO_HS_IDLE);
     214              :         Assert(ioh->owner_procno == MyProcNumber);
     215              : 
     216      1481515 :         pgaio_io_update_state(ioh, PGAIO_HS_HANDED_OUT);
     217      1481515 :         pgaio_my_backend->handed_out_io = ioh;
     218              : 
     219      1481515 :         if (resowner)
     220      1481515 :             pgaio_io_resowner_register(ioh, resowner);
     221              : 
     222      1481515 :         if (ret)
     223              :         {
     224      1481489 :             ioh->report_return = ret;
     225      1481489 :             ret->result.status = PGAIO_RS_UNKNOWN;
     226              :         }
     227              :     }
     228              : 
     229      1487815 :     RESUME_INTERRUPTS();
     230              : 
     231      1487815 :     return ioh;
     232              : }
     233              : 
     234              : /*
     235              :  * Release IO handle that turned out to not be required.
     236              :  *
     237              :  * See pgaio_io_acquire() for more details.
     238              :  */
     239              : void
     240         3878 : pgaio_io_release(PgAioHandle *ioh)
     241              : {
     242         3878 :     if (ioh == pgaio_my_backend->handed_out_io)
     243              :     {
     244              :         Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     245              :         Assert(ioh->resowner);
     246              : 
     247         3876 :         pgaio_my_backend->handed_out_io = NULL;
     248              : 
     249              :         /*
     250              :          * Note that no interrupts are processed between the handed_out_io
     251              :          * check and the call to reclaim - that's important as otherwise an
     252              :          * interrupt could have already reclaimed the handle.
     253              :          */
     254         3876 :         pgaio_io_reclaim(ioh);
     255              :     }
     256              :     else
     257              :     {
     258            2 :         elog(ERROR, "release in unexpected state");
     259              :     }
     260         3876 : }
     261              : 
     262              : /*
     263              :  * Release IO handle during resource owner cleanup.
     264              :  */
     265              : void
     266           47 : pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
     267              : {
     268           47 :     PgAioHandle *ioh = dlist_container(PgAioHandle, resowner_node, ioh_node);
     269              : 
     270              :     Assert(ioh->resowner);
     271              : 
     272              :     /*
     273              :      * Otherwise an interrupt, in the middle of releasing the IO, could end up
     274              :      * trying to wait for the IO, leading to state confusion.
     275              :      */
     276           47 :     HOLD_INTERRUPTS();
     277              : 
     278           47 :     ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     279           47 :     ioh->resowner = NULL;
     280              : 
     281           47 :     switch ((PgAioHandleState) ioh->state)
     282              :     {
     283            0 :         case PGAIO_HS_IDLE:
     284            0 :             elog(ERROR, "unexpected");
     285              :             break;
     286           33 :         case PGAIO_HS_HANDED_OUT:
     287              :             Assert(ioh == pgaio_my_backend->handed_out_io || pgaio_my_backend->handed_out_io == NULL);
     288              : 
     289           33 :             if (ioh == pgaio_my_backend->handed_out_io)
     290              :             {
     291           33 :                 pgaio_my_backend->handed_out_io = NULL;
     292           33 :                 if (!on_error)
     293           10 :                     elog(WARNING, "leaked AIO handle");
     294              :             }
     295              : 
     296           33 :             pgaio_io_reclaim(ioh);
     297           33 :             break;
     298            0 :         case PGAIO_HS_DEFINED:
     299              :         case PGAIO_HS_STAGED:
     300            0 :             if (!on_error)
     301            0 :                 elog(WARNING, "AIO handle was not submitted");
     302            0 :             pgaio_submit_staged();
     303            0 :             break;
     304           14 :         case PGAIO_HS_SUBMITTED:
     305              :         case PGAIO_HS_COMPLETED_IO:
     306              :         case PGAIO_HS_COMPLETED_SHARED:
     307              :         case PGAIO_HS_COMPLETED_LOCAL:
     308              :             /* this is expected to happen */
     309           14 :             break;
     310              :     }
     311              : 
     312              :     /*
     313              :      * Need to unregister the reporting of the IO's result, the memory it's
     314              :      * referencing likely has gone away.
     315              :      */
     316           47 :     if (ioh->report_return)
     317           14 :         ioh->report_return = NULL;
     318              : 
     319           47 :     RESUME_INTERRUPTS();
     320           47 : }
     321              : 
     322              : /*
     323              :  * Add a [set of] flags to the IO.
     324              :  *
     325              :  * Note that this ORs the passed-in flags into the already set flags, rather
     326              :  * than overwriting them with the passed-in value. This is to allow multiple
     327              :  * callsites to set flags.
     328              :  */
     329              : void
     330      2953606 : pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
     331              : {
     332              :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     333              : 
     334      2953606 :     ioh->flags |= flag;
     335      2953606 : }
     336              : 
     337              : /*
     338              :  * Returns an ID uniquely identifying the IO handle. This is only really
     339              :  * useful for logging, as handles are reused across multiple IOs.
     340              :  */
     341              : int
     342       721805 : pgaio_io_get_id(PgAioHandle *ioh)
     343              : {
     344              :     Assert(ioh >= pgaio_ctl->io_handles &&
     345              :            ioh < (pgaio_ctl->io_handles + pgaio_ctl->io_handle_count));
     346       721805 :     return ioh - pgaio_ctl->io_handles;
     347              : }
     348              : 
     349              : /*
     350              :  * Return the ProcNumber for the process that can use an IO handle. The
     351              :  * mapping from IO handles to PGPROCs is static, therefore this even works
     352              :  * when the corresponding PGPROC is not in use.
     353              :  */
     354              : ProcNumber
     355          302 : pgaio_io_get_owner(PgAioHandle *ioh)
     356              : {
     357          302 :     return ioh->owner_procno;
     358              : }
     359              : 
     360              : /*
     361              :  * Return a wait reference for the IO. Only wait references can be used to
     362              :  * wait for an IO's completion, as handles themselves can be reused after
     363              :  * completion.  See also the comment above pgaio_io_acquire().
     364              :  */
     365              : void
     366      2955227 : pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
     367              : {
     368              :     Assert(ioh->state == PGAIO_HS_HANDED_OUT ||
     369              :            ioh->state == PGAIO_HS_DEFINED ||
     370              :            ioh->state == PGAIO_HS_STAGED);
     371              :     Assert(ioh->generation != 0);
     372              : 
     373      2955227 :     iow->aio_index = ioh - pgaio_ctl->io_handles;
     374      2955227 :     iow->generation_upper = (uint32) (ioh->generation >> 32);
     375      2955227 :     iow->generation_lower = (uint32) ioh->generation;
     376      2955227 : }
     377              : 
     378              : 
     379              : 
     380              : /* --------------------------------------------------------------------------------
     381              :  * Internal Functions related to PgAioHandle
     382              :  * --------------------------------------------------------------------------------
     383              :  */
     384              : 
     385              : static inline void
     386     11557552 : pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
     387              : {
     388              :     /*
     389              :      * All callers need to have held interrupts in some form, otherwise
     390              :      * interrupt processing could wait for the IO to complete, while in an
     391              :      * intermediary state.
     392              :      */
     393              :     Assert(!INTERRUPTS_CAN_BE_PROCESSED());
     394              : 
     395     11557552 :     pgaio_debug_io(DEBUG5, ioh,
     396              :                    "updating state to %s",
     397              :                    pgaio_io_state_get_name(new_state));
     398              : 
     399              :     /*
     400              :      * Ensure the changes signified by the new state are visible before the
     401              :      * new state becomes visible.
     402              :      */
     403     11557552 :     pg_write_barrier();
     404              : 
     405     11557552 :     ioh->state = new_state;
     406     11557552 : }
     407              : 
     408              : static void
     409      1481515 : pgaio_io_resowner_register(PgAioHandle *ioh, struct ResourceOwnerData *resowner)
     410              : {
     411              :     Assert(!ioh->resowner);
     412              :     Assert(resowner);
     413              : 
     414      1481515 :     ResourceOwnerRememberAioHandle(resowner, &ioh->resowner_node);
     415      1481515 :     ioh->resowner = resowner;
     416      1481515 : }
     417              : 
     418              : /*
     419              :  * Stage IO for execution and, if appropriate, submit it immediately.
     420              :  *
     421              :  * Should only be called from pgaio_io_start_*().
     422              :  */
     423              : void
     424      1477606 : pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
     425              : {
     426              :     bool        needs_synchronous;
     427              : 
     428              :     Assert(ioh->state == PGAIO_HS_HANDED_OUT);
     429              :     Assert(pgaio_my_backend->handed_out_io == ioh);
     430              :     Assert(pgaio_io_has_target(ioh));
     431              : 
     432              :     /*
     433              :      * Otherwise an interrupt, in the middle of staging and possibly executing
     434              :      * the IO, could end up trying to wait for the IO, leading to state
     435              :      * confusion.
     436              :      */
     437      1477606 :     HOLD_INTERRUPTS();
     438              : 
     439      1477606 :     ioh->op = op;
     440      1477606 :     ioh->result = 0;
     441              : 
     442      1477606 :     pgaio_io_update_state(ioh, PGAIO_HS_DEFINED);
     443              : 
     444              :     /* allow a new IO to be staged */
     445      1477606 :     pgaio_my_backend->handed_out_io = NULL;
     446              : 
     447      1477606 :     pgaio_io_call_stage(ioh);
     448              : 
     449      1477606 :     pgaio_io_update_state(ioh, PGAIO_HS_STAGED);
     450              : 
     451              :     /*
     452              :      * Synchronous execution has to be executed, well, synchronously, so check
     453              :      * that first.
     454              :      */
     455      1477606 :     needs_synchronous = pgaio_io_needs_synchronous_execution(ioh);
     456              : 
     457      1477606 :     pgaio_debug_io(DEBUG3, ioh,
     458              :                    "staged (synchronous: %d, in_batch: %d)",
     459              :                    needs_synchronous, pgaio_my_backend->in_batchmode);
     460              : 
     461      1477606 :     if (!needs_synchronous)
     462              :     {
     463       680945 :         pgaio_my_backend->staged_ios[pgaio_my_backend->num_staged_ios++] = ioh;
     464              :         Assert(pgaio_my_backend->num_staged_ios <= PGAIO_SUBMIT_BATCH_SIZE);
     465              : 
     466              :         /*
     467              :          * Unless code explicitly opted into batching IOs, submit the IO
     468              :          * immediately.
     469              :          */
     470       680945 :         if (!pgaio_my_backend->in_batchmode)
     471        15007 :             pgaio_submit_staged();
     472              :     }
     473              :     else
     474              :     {
     475       796661 :         pgaio_io_prepare_submit(ioh);
     476       796661 :         pgaio_io_perform_synchronously(ioh);
     477              :     }
     478              : 
     479      1477606 :     RESUME_INTERRUPTS();
     480      1477606 : }
     481              : 
     482              : bool
     483      1477606 : pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
     484              : {
     485              :     /*
     486              :      * If the caller said to execute the IO synchronously, do so.
     487              :      *
     488              :      * XXX: We could optimize the logic when to execute synchronously by first
     489              :      * checking if there are other IOs in flight and only synchronously
     490              :      * executing if not. Unclear whether that'll be sufficiently common to be
     491              :      * worth worrying about.
     492              :      */
     493      1477606 :     if (ioh->flags & PGAIO_HF_SYNCHRONOUS)
     494       790947 :         return true;
     495              : 
     496              :     /* Check if the IO method requires synchronous execution of IO */
     497       686659 :     if (pgaio_method_ops->needs_synchronous_execution)
     498       686659 :         return pgaio_method_ops->needs_synchronous_execution(ioh);
     499              : 
     500            0 :     return false;
     501              : }
     502              : 
     503              : /*
     504              :  * Handle IO being processed by IO method.
     505              :  *
     506              :  * Should be called by IO methods / synchronous IO execution, just before the
     507              :  * IO is performed.
     508              :  */
     509              : void
     510      1477606 : pgaio_io_prepare_submit(PgAioHandle *ioh)
     511              : {
     512      1477606 :     pgaio_io_update_state(ioh, PGAIO_HS_SUBMITTED);
     513              : 
     514      1477606 :     dclist_push_tail(&pgaio_my_backend->in_flight_ios, &ioh->node);
     515      1477606 : }
     516              : 
     517              : /*
     518              :  * Handle IO getting completed by a method.
     519              :  *
     520              :  * Should be called by IO methods / synchronous IO execution, just after the
     521              :  * IO has been performed.
     522              :  *
     523              :  * Expects to be called in a critical section. We expect IOs to be usable for
     524              :  * WAL etc, which requires being able to execute completion callbacks in a
     525              :  * critical section.
     526              :  */
     527              : void
     528      1342049 : pgaio_io_process_completion(PgAioHandle *ioh, int result)
     529              : {
     530              :     Assert(ioh->state == PGAIO_HS_SUBMITTED);
     531              : 
     532              :     Assert(CritSectionCount > 0);
     533              : 
     534      1342049 :     ioh->result = result;
     535              : 
     536      1342049 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_IO);
     537              : 
     538      1342049 :     INJECTION_POINT("aio-process-completion-before-shared", ioh);
     539              : 
     540      1342049 :     pgaio_io_call_complete_shared(ioh);
     541              : 
     542      1342049 :     pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_SHARED);
     543              : 
     544              :     /* condition variable broadcast ensures state is visible before wakeup */
     545      1342049 :     ConditionVariableBroadcast(&ioh->cv);
     546              : 
     547              :     /* contains call to pgaio_io_call_complete_local() */
     548      1342049 :     if (ioh->owner_procno == MyProcNumber)
     549       800124 :         pgaio_io_reclaim(ioh);
     550      1342049 : }
     551              : 
     552              : /*
     553              :  * Has the IO completed and thus the IO handle been reused?
     554              :  *
     555              :  * This is useful when waiting for IO completion at a low level (e.g. in an IO
     556              :  * method's ->wait_one() callback).
     557              :  */
     558              : bool
     559      2319919 : pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
     560              : {
     561      2319919 :     *state = ioh->state;
     562              : 
     563              :     /*
     564              :      * Ensure that we don't see an earlier state of the handle than ioh->state
     565              :      * due to compiler or CPU reordering. This protects both ->generation as
     566              :      * directly used here, and other fields in the handle accessed in the
     567              :      * caller if the handle was not reused.
     568              :      */
     569      2319919 :     pg_read_barrier();
     570              : 
     571      2319919 :     return ioh->generation != ref_generation;
     572              : }
     573              : 
     574              : /*
     575              :  * Wait for IO to complete. External code should never use this; outside of
     576              :  * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
     577              :  */
     578              : static void
     579       328730 : pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
     580              : {
     581              :     PgAioHandleState state;
     582              :     bool        am_owner;
     583              : 
     584       328730 :     am_owner = ioh->owner_procno == MyProcNumber;
     585              : 
     586       328730 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     587           39 :         return;
     588              : 
     589       328691 :     if (am_owner)
     590              :     {
     591       325874 :         if (state != PGAIO_HS_SUBMITTED
     592        34482 :             && state != PGAIO_HS_COMPLETED_IO
     593          197 :             && state != PGAIO_HS_COMPLETED_SHARED
     594            0 :             && state != PGAIO_HS_COMPLETED_LOCAL)
     595              :         {
     596            0 :             elog(PANIC, "waiting for own IO %d in wrong state: %s",
     597              :                  pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
     598              :         }
     599              :     }
     600              : 
     601              :     while (true)
     602              :     {
     603       657113 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state))
     604         1853 :             return;
     605              : 
     606       655260 :         switch (state)
     607              :         {
     608            0 :             case PGAIO_HS_IDLE:
     609              :             case PGAIO_HS_HANDED_OUT:
     610            0 :                 elog(ERROR, "IO in wrong state: %d", state);
     611              :                 break;
     612              : 
     613       293077 :             case PGAIO_HS_SUBMITTED:
     614              : 
     615              :                 /*
     616              :                  * If we need to wait via the IO method, do so now. Don't
     617              :                  * check via the IO method if the issuing backend is executing
     618              :                  * the IO synchronously.
     619              :                  */
     620       293077 :                 if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
     621              :                 {
     622            0 :                     pgaio_method_ops->wait_one(ioh, ref_generation);
     623            0 :                     continue;
     624              :                 }
     625              :                 pg_fallthrough;
     626              : 
     627              :                 /* waiting for owner to submit */
     628              :             case PGAIO_HS_DEFINED:
     629              :             case PGAIO_HS_STAGED:
     630              :                 /* waiting for reaper to complete */
     631              :                 /* fallthrough */
     632              :             case PGAIO_HS_COMPLETED_IO:
     633              :                 /* shouldn't be able to hit this otherwise */
     634              :                 Assert(IsUnderPostmaster);
     635              :                 /* ensure we're going to get woken up */
     636       328423 :                 ConditionVariablePrepareToSleep(&ioh->cv);
     637              : 
     638       656481 :                 while (!pgaio_io_was_recycled(ioh, ref_generation, &state))
     639              :                 {
     640       654652 :                     if (state == PGAIO_HS_COMPLETED_SHARED ||
     641       328094 :                         state == PGAIO_HS_COMPLETED_LOCAL)
     642              :                         break;
     643       328059 :                     ConditionVariableSleep(&ioh->cv, WAIT_EVENT_AIO_IO_COMPLETION);
     644              :                 }
     645              : 
     646       328422 :                 ConditionVariableCancelSleep();
     647       328422 :                 break;
     648              : 
     649       326837 :             case PGAIO_HS_COMPLETED_SHARED:
     650              :             case PGAIO_HS_COMPLETED_LOCAL:
     651              : 
     652              :                 /*
     653              :                  * Note that no interrupts are processed between
     654              :                  * pgaio_io_was_recycled() and this check - that's important
     655              :                  * as otherwise an interrupt could have already reclaimed the
     656              :                  * handle.
     657              :                  */
     658       326837 :                 if (am_owner)
     659       325873 :                     pgaio_io_reclaim(ioh);
     660       326837 :                 return;
     661              :         }
     662              :     }
     663              : }
     664              : 
     665              : /*
     666              :  * Make IO handle ready to be reused after IO has completed or after the
     667              :  * handle has been released without being used.
     668              :  *
     669              :  * Note that callers need to be careful about only calling this in the right
     670              :  * state and that no interrupts can be processed between the state check and
     671              :  * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
     672              :  * already have reclaimed the handle.
     673              :  */
     674              : static void
     675      1481515 : pgaio_io_reclaim(PgAioHandle *ioh)
     676              : {
     677              :     /* This is only ok if it's our IO */
     678              :     Assert(ioh->owner_procno == MyProcNumber);
     679              :     Assert(ioh->state != PGAIO_HS_IDLE);
     680              : 
     681              :     /* see comment in function header */
     682      1481515 :     HOLD_INTERRUPTS();
     683              : 
     684              :     /*
     685              :      * It's a bit ugly, but right now the easiest place to put the execution
     686              :      * of local completion callbacks is this function, as we need to execute
     687              :      * local callbacks just before reclaiming at multiple callsites.
     688              :      */
     689      1481515 :     if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     690              :     {
     691              :         PgAioResult local_result;
     692              : 
     693      1477606 :         local_result = pgaio_io_call_complete_local(ioh);
     694      1477606 :         pgaio_io_update_state(ioh, PGAIO_HS_COMPLETED_LOCAL);
     695              : 
     696      1477606 :         if (ioh->report_return) /* result destination is optional */
     697              :         {
     698      1477592 :             ioh->report_return->result = local_result;
     699      1477592 :             ioh->report_return->target_data = ioh->target_data;
     700              :         }
     701              :     }
     702              : 
     703      1481515 :     pgaio_debug_io(DEBUG4, ioh,
     704              :                    "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
     705              :                    pgaio_result_status_string(ioh->distilled_result.status),
     706              :                    ioh->distilled_result.id,
     707              :                    ioh->distilled_result.error_data,
     708              :                    ioh->result);
     709              : 
     710              :     /* if the IO has been defined, it's on the in-flight list, remove */
     711      1481515 :     if (ioh->state != PGAIO_HS_HANDED_OUT)
     712      1477606 :         dclist_delete_from(&pgaio_my_backend->in_flight_ios, &ioh->node);
     713              : 
     714      1481515 :     if (ioh->resowner)      /* detach from the resource owner, if any */
     715              :     {
     716      1481468 :         ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
     717      1481468 :         ioh->resowner = NULL;
     718              :     }
     719              : 
     720              :     Assert(!ioh->resowner);
     721              : 
     722              :     /*
     723              :      * Update generation & state first, before resetting the IO's fields,
     724              :      * otherwise a concurrent "viewer" could think the fields are valid, even
     725              :      * though they are being reset.  Increment the generation first, so that
     726              :      * we can assert elsewhere that we never wait for an IDLE IO.  While it's
     727              :      * a bit weird for the state to go backwards for a generation, it's OK
     728              :      * here, as there cannot be references to the "reborn" IO yet.  Can't
     729              :      * update both at once, so something has to give.
     730              :      */
     731      1481515 :     ioh->generation++;
     732      1481515 :     pgaio_io_update_state(ioh, PGAIO_HS_IDLE);
     733              : 
     734              :     /* ensure the state update is visible before we reset fields */
     735      1481515 :     pg_write_barrier();
     736              : 
     737      1481515 :     ioh->op = PGAIO_OP_INVALID;
     738      1481515 :     ioh->target = PGAIO_TID_INVALID;
     739      1481515 :     ioh->flags = 0;
     740      1481515 :     ioh->num_callbacks = 0;
     741      1481515 :     ioh->handle_data_len = 0;
     742      1481515 :     ioh->report_return = NULL;
     743      1481515 :     ioh->result = 0;
     744      1481515 :     ioh->distilled_result.status = PGAIO_RS_UNKNOWN;
     745              : 
     746              :     /*
     747              :      * We push the IO to the head of the idle IO list, that seems more cache
     748              :      * efficient in cases where only a few IOs are used.
     749              :      */
     750      1481515 :     dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
     751              : 
     752      1481515 :     RESUME_INTERRUPTS();    /* pairs with HOLD_INTERRUPTS() above */
     753      1481515 : }
     754              : 
     755              : /*
     756              :  * Wait for an IO handle to become usable.
     757              :  *
     758              :  * This is only really useful for pgaio_io_acquire().
     759              :  */
     760              : static void
     761         3150 : pgaio_io_wait_for_free(void)
     762              : {
     763         3150 :     int         reclaimed = 0;
     764              : 
     765         3150 :     pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %u in-flight, %u idle IOs",
     766              :                 pgaio_my_backend->num_staged_ios,
     767              :                 dclist_count(&pgaio_my_backend->in_flight_ios),
     768              :                 dclist_count(&pgaio_my_backend->idle_ios));
     769              : 
     770              :     /*
     771              :      * First check if any of our IOs actually have completed - when using
     772              :      * worker, that'll often be the case. We could do so as part of the loop
     773              :      * below, but that'd potentially lead us to wait for some IO submitted
     774              :      * before.
     775              :      */
     776         6300 :     for (int i = 0; i < io_max_concurrency; i++)
     777              :     {
     778         3150 :         PgAioHandle *ioh = &pgaio_ctl->io_handles[pgaio_my_backend->io_handle_off + i];
     779              : 
     780         3150 :         if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
     781              :         {
     782              :             /*
     783              :              * Note that no interrupts are processed between the state check
     784              :              * and the call to reclaim - that's important as otherwise an
     785              :              * interrupt could have already reclaimed the handle.
     786              :              *
     787              :              * Need to ensure that there's no reordering; in the more common
     788              :              * paths, where we wait for IO, that's done by
     789              :              * pgaio_io_was_recycled().
     790              :              */
     791         2207 :             pg_read_barrier();
     792         2207 :             pgaio_io_reclaim(ioh);
     793         2207 :             reclaimed++;
     794              :         }
     795              :     }
     796              : 
     797         3150 :     if (reclaimed > 0)
     798         2207 :         return;             /* reclaiming put a handle on the idle list */
     799              : 
     800              :     /*
     801              :      * If we have any unsubmitted IOs, submit them now. We'll start waiting in
     802              :      * a second, so it's better they're in flight. This also addresses the
     803              :      * edge-case that all IOs are unsubmitted.
     804              :      */
     805          943 :     if (pgaio_my_backend->num_staged_ios > 0)
     806            0 :         pgaio_submit_staged();
     807              : 
     808              :     /* possibly some IOs finished during submission */
     809          943 :     if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
     810            0 :         return;
     811              : 
     812          943 :     if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
     813            0 :         ereport(ERROR,
     814              :                 errmsg_internal("no free IOs despite no in-flight IOs"),
     815              :                 errdetail_internal("%d pending, %u in-flight, %u idle IOs",
     816              :                                    pgaio_my_backend->num_staged_ios,
     817              :                                    dclist_count(&pgaio_my_backend->in_flight_ios),
     818              :                                    dclist_count(&pgaio_my_backend->idle_ios)));
     819              : 
     820              :     /*
     821              :      * Wait for the oldest in-flight IO to complete.
     822              :      *
     823              :      * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
     824              :      * for that specific IO to complete, we just need *any* IO to complete.
     825              :      */
     826              :     {
     827          943 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
     828              :                                                &pgaio_my_backend->in_flight_ios);
     829          943 :         uint64      generation = ioh->generation;   /* read before any interrupts */
     830              : 
     831          943 :         switch ((PgAioHandleState) ioh->state)
     832              :         {
     833              :                 /* should not be in in-flight list */
     834            0 :             case PGAIO_HS_IDLE:
     835              :             case PGAIO_HS_DEFINED:
     836              :             case PGAIO_HS_HANDED_OUT:
     837              :             case PGAIO_HS_STAGED:
     838              :             case PGAIO_HS_COMPLETED_LOCAL:
     839            0 :                 elog(ERROR, "shouldn't get here with io:%d in state %d",
     840              :                      pgaio_io_get_id(ioh), ioh->state);
     841              :                 break;
     842              : 
     843          942 :             case PGAIO_HS_COMPLETED_IO:
     844              :             case PGAIO_HS_SUBMITTED:
     845          942 :                 pgaio_debug_io(DEBUG2, ioh,
     846              :                                "waiting for free io with %u in flight",
     847              :                                dclist_count(&pgaio_my_backend->in_flight_ios));
     848              : 
     849              :                 /*
     850              :                  * In a more general case this would be racy, because the
     851              :                  * generation could increase after we read ioh->state above.
     852              :                  * But we are only looking at IOs by the current backend and
     853              :                  * the IO can only be recycled by this backend.  Even this is
     854              :                  * only OK because we get the handle's generation before
     855              :                  * potentially processing interrupts, e.g. as part of
     856              :                  * pgaio_debug_io().
     857              :                  */
     858          942 :                 pgaio_io_wait(ioh, generation);
     859          942 :                 break;
     860              : 
     861            1 :             case PGAIO_HS_COMPLETED_SHARED:
     862              : 
     863              :                 /*
     864              :                  * It's possible that another backend just finished this IO.
     865              :                  *
     866              :                  * Note that no interrupts are processed between the state
     867              :                  * check and the call to reclaim - that's important as
     868              :                  * otherwise an interrupt could have already reclaimed the
     869              :                  * handle.
     870              :                  *
     871              :                  * Need to ensure that there's no reordering; in the more
     872              :                  * common paths, where we wait for IO, that's done by
     873              :                  * pgaio_io_was_recycled().
     874              :                  */
     875            1 :                 pg_read_barrier();
     876            1 :                 pgaio_io_reclaim(ioh);
     877            1 :                 break;
     878              :         }
     879              : 
     880          943 :         if (dclist_count(&pgaio_my_backend->idle_ios) == 0)
     881            0 :             elog(PANIC, "no idle IO after waiting for IO to terminate");
     882          943 :         return;
     883              :     }
     884              : }
     885              : 
     886              : /*
     887              :  * Internal - code outside of AIO should never need this and it'd be hard for
     888              :  * such code to be safe.  Returns the handle, generation via *ref_generation.
     889              :  */
     890              : static PgAioHandle *
     891      1005370 : pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
     892              : {
     893              :     PgAioHandle *ioh;
     894              : 
     895              :     Assert(iow->aio_index < pgaio_ctl->io_handle_count);
     896              : 
     897      1005370 :     ioh = &pgaio_ctl->io_handles[iow->aio_index];
     898              : 
     899      1005370 :     *ref_generation = ((uint64) iow->generation_upper) << 32 |
     900      1005370 :         iow->generation_lower;  /* combine the two halves stored in the wref */
     901              : 
     902              :     Assert(*ref_generation != 0);
     903              : 
     904      1005370 :     return ioh;
     905              : }
     906              : 
     907              : static const char *         /* name of state s, without the PGAIO_HS_ prefix */
     908        12109 : pgaio_io_state_get_name(PgAioHandleState s)
     909              : {
     910              : #define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
     911        12109 :     switch (s)
     912              :     {
     913            0 :             PGAIO_HS_TOSTR_CASE(IDLE);
     914         4028 :             PGAIO_HS_TOSTR_CASE(HANDED_OUT);
     915         2014 :             PGAIO_HS_TOSTR_CASE(DEFINED);
     916         2014 :             PGAIO_HS_TOSTR_CASE(STAGED);
     917           13 :             PGAIO_HS_TOSTR_CASE(SUBMITTED);
     918         2014 :             PGAIO_HS_TOSTR_CASE(COMPLETED_IO);
     919         2026 :             PGAIO_HS_TOSTR_CASE(COMPLETED_SHARED);
     920            0 :             PGAIO_HS_TOSTR_CASE(COMPLETED_LOCAL);
     921              :     }
     922              : #undef PGAIO_HS_TOSTR_CASE
     923              : 
     924            0 :     return NULL;                /* no case matched; silence compiler */
     925              : }
     926              : 
     927              : const char *                /* name of the handle's current state */
     928        12109 : pgaio_io_get_state_name(PgAioHandle *ioh)
     929              : {
     930        12109 :     return pgaio_io_state_get_name(ioh->state);
     931              : }
     932              : 
     933              : const char *                /* name of rs, without the PGAIO_RS_ prefix */
     934         4028 : pgaio_result_status_string(PgAioResultStatus rs)
     935              : {
     936         4028 :     switch (rs)
     937              :     {
     938            0 :         case PGAIO_RS_UNKNOWN:
     939            0 :             return "UNKNOWN";
     940         3646 :         case PGAIO_RS_OK:
     941         3646 :             return "OK";
     942           68 :         case PGAIO_RS_WARNING:
     943           68 :             return "WARNING";
     944          218 :         case PGAIO_RS_PARTIAL:
     945          218 :             return "PARTIAL";
     946           96 :         case PGAIO_RS_ERROR:
     947           96 :             return "ERROR";
     948              :     }
     949              : 
     950            0 :     return NULL;                /* no case matched; silence compiler */
     951              : }
     952              : 
     953              : 
     954              : 
     955              : /* --------------------------------------------------------------------------------
     956              :  * Functions primarily related to IO Wait References
     957              :  * --------------------------------------------------------------------------------
     958              :  */
     959              : 
     960              : /*
     961              :  * Mark a wait reference as invalid, so that pgaio_wref_valid() returns false
     962              :  */
     963              : void
     964     16607757 : pgaio_wref_clear(PgAioWaitRef *iow)
     965              : {
     966     16607757 :     iow->aio_index = PG_UINT32_MAX; /* the value pgaio_wref_valid() tests for */
     967     16607757 : }
     968              : 
     969              : /* Is the wait reference valid, i.e. not cleared by pgaio_wref_clear()? */
     970              : bool
     971      3031317 : pgaio_wref_valid(PgAioWaitRef *iow)
     972              : {
     973      3031317 :     return iow->aio_index != PG_UINT32_MAX;
     974              : }
     975              : 
     976              : /*
     977              :  * Similar to pgaio_io_get_id(), just for wait references (which must be valid).
     978              :  */
     979              : int
     980            0 : pgaio_wref_get_id(PgAioWaitRef *iow)
     981              : {
     982              :     Assert(pgaio_wref_valid(iow));
     983            0 :     return iow->aio_index;  /* uint32 index returned as int */
     984              : }
     985              : 
     986              : /*
     987              :  * Wait for the IO to have completed. Can be called in any process, not just
     988              :  * in the issuing backend.
     989              :  */
     990              : void
     991       327775 : pgaio_wref_wait(PgAioWaitRef *iow)
     992              : {
     993              :     uint64      ref_generation;
     994              :     PgAioHandle *ioh;
     995              : 
     996       327775 :     ioh = pgaio_io_from_wref(iow, &ref_generation); /* handle + generation */
     997              : 
     998       327775 :     pgaio_io_wait(ioh, ref_generation);
     999       327774 : }
    1000              : 
    1001              : /*
    1002              :  * Check if the referenced IO completed, without blocking. True means done.
    1003              :  */
    1004              : bool
    1005       677595 : pgaio_wref_check_done(PgAioWaitRef *iow)
    1006              : {
    1007              :     uint64      ref_generation;
    1008              :     PgAioHandleState state;
    1009              :     bool        am_owner;
    1010              :     PgAioHandle *ioh;
    1011              : 
    1012       677595 :     ioh = pgaio_io_from_wref(iow, &ref_generation);
    1013              : 
    1014       677595 :     if (pgaio_io_was_recycled(ioh, ref_generation, &state))
    1015          269 :         return true;        /* handle was reused, so the IO is done */
    1016              : 
    1017       677326 :     if (state == PGAIO_HS_IDLE)
    1018            0 :         return true;
    1019              : 
    1020       677326 :     am_owner = ioh->owner_procno == MyProcNumber;
    1021              : 
    1022              :     /*
    1023              :      * If the IO is not executing synchronously, allow the IO method to check
    1024              :      * if the IO already has completed.
    1025              :      */
    1026       677326 :     if (pgaio_method_ops->check_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS))
    1027              :     {
    1028            0 :         pgaio_method_ops->check_one(ioh, ref_generation);
    1029              : 
    1030            0 :         if (pgaio_io_was_recycled(ioh, ref_generation, &state)) /* refresh state */
    1031            0 :             return true;
    1032              : 
    1033            0 :         if (state == PGAIO_HS_IDLE)
    1034            0 :             return true;
    1035              :     }
    1036              : 
    1037       677326 :     if (state == PGAIO_HS_COMPLETED_SHARED ||
    1038       327645 :         state == PGAIO_HS_COMPLETED_LOCAL)
    1039              :     {
    1040              :         /*
    1041              :          * Note that no interrupts are processed between
    1042              :          * pgaio_io_was_recycled() and this check - that's important as
    1043              :          * otherwise an interrupt could have already reclaimed the handle.
    1044              :          */
    1045       349684 :         if (am_owner)
    1046       349401 :             pgaio_io_reclaim(ioh);
    1047       349684 :         return true;
    1048              :     }
    1049              : 
    1050       327642 :     return false;           /* any other state counts as not yet done */
    1051              : }
    1052              : 
    1053              : 
    1054              : 
    1055              : /* --------------------------------------------------------------------------------
    1056              :  * Actions on multiple IOs.
    1057              :  * --------------------------------------------------------------------------------
    1058              :  */
    1059              : 
    1060              : /*
    1061              :  * Submit IOs in batches going forward.
    1062              :  *
    1063              :  * Submitting multiple IOs at once can be substantially faster than doing so
    1064              :  * one-by-one. At the same time, submitting multiple IOs at once requires more
    1065              :  * care to avoid deadlocks.
    1066              :  *
    1067              :  * Consider backend A staging an IO for buffer 1 and then trying to start IO
    1068              :  * on buffer 2, while backend B does the inverse. If A submitted the IO before
    1069              :  * moving on to buffer 2, this works just fine, B will wait for the IO to
    1070              :  * complete. But if batching were used, each backend would wait for IO that
    1071              :  * has not yet been submitted to complete, i.e. forever.
    1072              :  *
    1073              :  * End batch submission mode with pgaio_exit_batchmode().  (Throwing errors is
    1074              :  * allowed; error recovery will end the batch.)
    1075              :  *
    1076              :  * To avoid deadlocks, code needs to ensure that it will not wait for another
    1077              :  * backend while there is unsubmitted IO. E.g. by using conditional lock
    1078              :  * acquisition when acquiring buffer locks. To check if there currently are
    1079              :  * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
    1080              :  * pgaio_submit_staged().
    1081              :  *
    1082              :  * It is not allowed to enter batchmode while already in batchmode; that is
    1083              :  * unlikely to ever be needed, as code needs to be explicitly aware of being
    1084              :  * called in batchmode, to avoid the deadlock risks explained above.
    1085              :  *
    1086              :  * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
    1087              :  * e.g. because too many IOs have been staged or because pgaio_submit_staged()
    1088              :  * was called.
    1089              :  */
    1090              : void
    1091      3219964 : pgaio_enter_batchmode(void)
    1092              : {
    1093      3219964 :     if (pgaio_my_backend->in_batchmode)
    1094            0 :         elog(ERROR, "starting batch while batch already in progress");
    1095      3219964 :     pgaio_my_backend->in_batchmode = true;
    1096      3219964 : }
    1097              : 
    1098              : /*
    1099              :  * Stop submitting IOs in batches; any IOs still staged are submitted first.
    1100              :  */
    1101              : void
    1102      3219952 : pgaio_exit_batchmode(void)
    1103              : {
    1104              :     Assert(pgaio_my_backend->in_batchmode);
    1105              : 
    1106      3219952 :     pgaio_submit_staged();
    1107      3219952 :     pgaio_my_backend->in_batchmode = false;
    1108      3219952 : }
    1109              : 
    1110              : /*
    1111              :  * Are there staged but unsubmitted IOs?
    1112              :  *
    1113              :  * See comment above pgaio_enter_batchmode() for why code may need to check if
    1114              :  * there is IO in that state.
    1115              :  */
    1116              : bool
    1117            0 : pgaio_have_staged(void)
    1118              : {
    1119              :     Assert(pgaio_my_backend->in_batchmode ||
    1120              :            pgaio_my_backend->num_staged_ios == 0);  /* staged implies batchmode */
    1121            0 :     return pgaio_my_backend->num_staged_ios > 0;
    1122              : }
    1123              : 
    1124              : /*
    1125              :  * Submit all staged but not yet submitted IOs.
    1126              :  *
    1127              :  * Unless in batch mode, this never needs to be called, as IOs get submitted
    1128              :  * as soon as possible. While in batchmode pgaio_submit_staged() can be called
    1129              :  * before waiting on another backend, to avoid the risk of deadlocks. See
    1130              :  * pgaio_enter_batchmode().
    1131              :  */
    1132              : void
    1133      3238503 : pgaio_submit_staged(void)
    1134              : {
    1135      3238503 :     int         total_submitted = 0;
    1136              :     int         did_submit;
    1137              : 
    1138      3238503 :     if (pgaio_my_backend->num_staged_ios == 0)
    1139      2557735 :         return;             /* nothing staged, nothing to do */
    1140              : 
    1141              : 
    1142       680768 :     START_CRIT_SECTION();
    1143              : 
    1144       680768 :     did_submit = pgaio_method_ops->submit(pgaio_my_backend->num_staged_ios,
    1145       680768 :                                           pgaio_my_backend->staged_ios);
    1146              : 
    1147       680768 :     END_CRIT_SECTION();
    1148              : 
    1149       680768 :     total_submitted += did_submit;
    1150              : 
    1151              :     Assert(total_submitted == did_submit);
    1152              : 
    1153       680768 :     pgaio_my_backend->num_staged_ios = 0;   /* staged set is now empty */
    1154              : 
    1155       680768 :     pgaio_debug(DEBUG4,
    1156              :                 "aio: submitted %d IOs",
    1157              :                 total_submitted);
    1158              : }
    1159              : 
    1160              : 
    1161              : 
    1162              : /* --------------------------------------------------------------------------------
    1163              :  * Other
    1164              :  * --------------------------------------------------------------------------------
    1165              :  */
    1166              : 
    1167              : 
    1168              : /*
    1169              :  * Perform AIO related cleanup after an error.
    1170              :  *
    1171              :  * This should be called early in the error recovery paths, as later steps may
    1172              :  * need to issue AIO (e.g. to record a transaction abort WAL record).
                      :  *
                      :  * If the backend is still marked as being in batch mode, the flag is
                      :  * cleared first and then any staged IOs are submitted with
                      :  * pgaio_submit_staged(). If the backend is not in batch mode, only the
                      :  * final assertion runs.
                      :  *
                      :  * Besides the error recovery paths, AtEOXact_Aio() also calls this when it
                      :  * finds a batch still open.
    1173              :  */
    1174              : void
    1175        41381 : pgaio_error_cleanup(void)
    1176              : {
    1177              :     /*
    1178              :      * It is possible that code errored out after pgaio_enter_batchmode() but
    1179              :      * before pgaio_exit_batchmode() was called. In that case we need to
    1180              :      * submit the IO now.
    1181              :      */
    1182        41381 :     if (pgaio_my_backend->in_batchmode)
    1183              :     {
    1184           12 :         pgaio_my_backend->in_batchmode = false;
    1185              : 
    1186           12 :         pgaio_submit_staged();
    1187              :     }
    1188              : 
    1189              :     /*
    1190              :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1191              :      */
    1192              :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1193        41381 : }
    1194              : 
    1195              : /*
    1196              :  * Perform AIO related checks at (sub-)transactional boundaries.
    1197              :  *
    1198              :  * This should be called late during (sub-)transactional commit/abort, after
    1199              :  * all steps that might need to perform AIO, so that we can verify that the
    1200              :  * AIO subsystem is in a valid state at the end of a transaction.
                      :  *
                      :  * is_commit is accepted for the caller's convenience but is not consulted
                      :  * by this function; commit and abort are handled identically here.
                      :  *
                      :  * If a batch is unexpectedly still open, it is closed via
                      :  * pgaio_error_cleanup() and a WARNING is emitted afterwards, so the staged
                      :  * IOs are submitted before the message is reported.
    1201              :  */
    1202              : void
    1203       658670 : AtEOXact_Aio(bool is_commit)
    1204              : {
    1205              :     /*
    1206              :      * We should never be in batch mode at transactional boundaries. In case
    1207              :      * an error was thrown while in batch mode, pgaio_error_cleanup() should
    1208              :      * have exited batchmode.
    1209              :      *
    1210              :      * In case we are in batchmode somehow, make sure to submit all staged
    1211              :      * IOs, other backends may need them to complete to continue.
    1212              :      */
    1213       658670 :     if (pgaio_my_backend->in_batchmode)
    1214              :     {
    1215            4 :         pgaio_error_cleanup();
    1216            4 :         elog(WARNING, "open AIO batch at end of (sub-)transaction");
    1217              :     }
    1218              : 
    1219              :     /*
    1220              :      * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
    1221              :      */
    1222              :     Assert(pgaio_my_backend->num_staged_ios == 0);
    1223       658670 : }
    1224              : 
    1225              : /*
    1226              :  * Need to submit staged but not yet submitted IOs using the fd, otherwise
    1227              :  * the IO would end up targeting something bogus.
                      :  *
                      :  * fd is the file descriptor that is about to be closed.
                      :  *
                      :  * Returns immediately if pgaio_my_backend is not set. Otherwise this works
                      :  * in two phases:
                      :  *
                      :  * 1) If any IOs are staged, all of them are submitted, regardless of which
                      :  * fd they use.
                      :  *
                      :  * 2) If the IO method sets wait_on_fd_before_close, in-flight IOs for which
                      :  * pgaio_io_uses_fd() reports use of fd are waited for, one at a time. The
                      :  * in-flight list is rescanned from the start after every wait, and the
                      :  * loop ends once no in-flight IO uses fd.
                      :  *
                      :  * The handle's generation is read while scanning and passed to
                      :  * pgaio_io_wait() together with the handle. As the scan stops right after
                      :  * a match, the generation used is the one belonging to the matched handle.
    1228              :  */
    1229              : void
    1230      8493005 : pgaio_closing_fd(int fd)
    1231              : {
    1232              :     /*
    1233              :      * Might be called before AIO is initialized or in a subprocess that
    1234              :      * doesn't use AIO.
    1235              :      */
    1236      8493005 :     if (!pgaio_my_backend)
    1237        10176 :         return;
    1238              : 
    1239              :     /*
    1240              :      * For now just submit all staged IOs - we could be more selective, but
    1241              :      * it's probably not worth it.
    1242              :      */
    1243      8482829 :     if (pgaio_my_backend->num_staged_ios > 0)
    1244              :     {
    1245            2 :         pgaio_debug(DEBUG2,
    1246              :                     "submitting %d IOs before FD %d gets closed",
    1247              :                     pgaio_my_backend->num_staged_ios, fd);
    1248            2 :         pgaio_submit_staged();
    1249              :     }
    1250              : 
    1251              :     /*
    1252              :      * If requested by the IO method, wait for all IOs that use the
    1253              :      * to-be-closed FD.
    1254              :      */
    1255      8482829 :     if (pgaio_method_ops->wait_on_fd_before_close)
    1256              :     {
    1257              :         /*
    1258              :          * As waiting for one IO to complete may complete multiple IOs, we
    1259              :          * can't just use a mutable list iterator. The maximum number of
    1260              :          * in-flight IOs is fairly small, so just restart the loop after
    1261              :          * waiting for an IO.
    1262              :          */
    1263            0 :         while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1264              :         {
    1265              :             dlist_iter  iter;
    1266            0 :             PgAioHandle *ioh = NULL;
    1267              :             uint64      generation;
    1268              : 
    1269            0 :             dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
    1270              :             {
    1271            0 :                 ioh = dclist_container(PgAioHandle, node, iter.cur);
    1272              : 
    1273            0 :                 generation = ioh->generation;
    1274              : 
    1275            0 :                 if (pgaio_io_uses_fd(ioh, fd))
    1276            0 :                     break;
    1277              :                 else
    1278            0 :                     ioh = NULL;
    1279              :             }
    1280              : 
    1281            0 :             if (!ioh)
    1282            0 :                 break;
    1283              : 
    1284            0 :             pgaio_debug_io(DEBUG2, ioh,
    1285              :                            "waiting for IO before FD %d gets closed, %u in-flight IOs",
    1286              :                            fd, dclist_count(&pgaio_my_backend->in_flight_ios));
    1287              : 
    1288              :             /* see comment in pgaio_io_wait_for_free() about raciness */
    1289            0 :             pgaio_io_wait(ioh, generation);
    1290              :         }
    1291              :     }
    1292              : }
    1293              : 
    1294              : /*
    1295              :  * Registered as before_shmem_exit() callback in pgaio_init_backend()
                      :  *
                      :  * code is the exit code; a value of 0 is reported to AtEOXact_Aio() as a
                      :  * commit, anything else as an abort. arg is not used.
                      :  *
                      :  * After the transaction-boundary checks, this waits for every IO on the
                      :  * backend's in-flight list, always picking the current head of the list,
                      :  * until the list is empty. Finally pgaio_my_backend is reset to NULL, which
                      :  * makes later pgaio_closing_fd() calls in this process return early.
    1296              :  */
    1297              : void
    1298        22835 : pgaio_shutdown(int code, Datum arg)
    1299              : {
    1300              :     Assert(pgaio_my_backend);
    1301              :     Assert(!pgaio_my_backend->handed_out_io);
    1302              : 
    1303              :     /* first clean up resources as we would at a transaction boundary */
    1304        22835 :     AtEOXact_Aio(code == 0);
    1305              : 
    1306              :     /*
    1307              :      * Before exiting, make sure that all IOs are finished. That has two main
    1308              :      * purposes:
    1309              :      *
    1310              :      * - Some kernel-level AIO mechanisms don't deal well with the issuer of
    1311              :      * an AIO exiting before IO completed
    1312              :      *
    1313              :      * - It'd be confusing to see partially finished IOs in stats views etc
    1314              :      */
    1315        22848 :     while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
    1316              :     {
    1317           13 :         PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
    1318           13 :         uint64      generation = ioh->generation;
    1319              : 
    1320           13 :         pgaio_debug_io(DEBUG2, ioh,
    1321              :                        "waiting for IO to complete during shutdown, %u in-flight IOs",
    1322              :                        dclist_count(&pgaio_my_backend->in_flight_ios));
    1323              : 
    1324              :         /* see comment in pgaio_io_wait_for_free() about raciness */
    1325           13 :         pgaio_io_wait(ioh, generation);
    1326              :     }
    1327              : 
    1328        22835 :     pgaio_my_backend = NULL;
    1329        22835 : }
    1330              : /*
                      :  * Select the active IO method implementation.
                      :  *
                      :  * newval is used as an index into pgaio_method_ops_table; the entry found
                      :  * there becomes pgaio_method_ops. That newval is in range and that the
                      :  * entry is non-NULL is verified by assertions only. extra is not used.
                      :  *
                      :  * NOTE(review): the signature looks like a GUC assign hook, presumably for
                      :  * the io_method setting - confirm against the GUC definition.
                      :  */
    1331              : void
    1332         1291 : assign_io_method(int newval, void *extra)
    1333              : {
    1334              :     Assert(newval < lengthof(pgaio_method_ops_table));
    1335              :     Assert(pgaio_method_ops_table[newval] != NULL);
    1336              : 
    1337         1291 :     pgaio_method_ops = pgaio_method_ops_table[newval];
    1338         1291 : }
    1339              : /*
                      :  * Validate a proposed io_max_concurrency value.
                      :  *
                      :  * Returns true if *newval is acceptable, false otherwise. -1 is accepted
                      :  * and left untouched, 0 is rejected with an error detail message, and every
                      :  * other value is accepted. *newval is never modified; extra and source are
                      :  * not used.
                      :  *
                      :  * NOTE(review): negative values other than -1 are not rejected here;
                      :  * presumably the GUC's declared minimum already rules them out - confirm
                      :  * against the GUC definition.
                      :  */
    1340              : bool
    1341         2504 : check_io_max_concurrency(int *newval, void **extra, GucSource source)
    1342              : {
    1343         2504 :     if (*newval == -1)
    1344              :     {
    1345              :         /*
    1346              :          * Auto-tuning will be applied later during startup, as auto-tuning
    1347              :          * depends on the value of various GUCs.
    1348              :          */
    1349         1275 :         return true;
    1350              :     }
    1351         1229 :     else if (*newval == 0)
    1352              :     {
    1353            0 :         GUC_check_errdetail("Only -1 or values bigger than 0 are valid.");
    1354            0 :         return false;
    1355              :     }
    1356              : 
    1357         1229 :     return true;
    1358              : }
        

Generated by: LCOV version 2.0-1