LCOV - code coverage report
Current view: top level - src/common - instr_time.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 85.2 % 108 92
Test Date: 2026-05-21 09:16:36 Functions: 100.0 % 10 10
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * instr_time.c
       4              :  *     Non-inline parts of the portable high-precision interval timing
       5              :  *   implementation
       6              :  *
       7              :  * Portions Copyright (c) 2026, PostgreSQL Global Development Group
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/common/instr_time.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : #ifndef FRONTEND
      16              : #include "postgres.h"
      17              : #else
      18              : #include "postgres_fe.h"
      19              : #endif
      20              : 
      21              : #include <math.h>
      22              : 
      23              : #include "port/pg_cpu.h"
      24              : #include "portability/instr_time.h"
      25              : 
      26              : /*
      27              :  * Stores what the number of ticks needs to be multiplied with to end up
      28              :  * with nanoseconds using integer math.
      29              :  *
      30              :  * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
      31              :  * the ticks to nanoseconds conversion requires floating point math because:
      32              :  *
      33              :  * sec = ticks / frequency_hz
      34              :  * ns  = ticks / frequency_hz * 1,000,000,000
      35              :  * ns  = ticks * (1,000,000,000 / frequency_hz)
      36              :  * ns  = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
      37              :  *
      38              :  * Here, the scaling factor (and thus 'ns') is usually not an integer. For
      39              :  * example, for a 2.5 GHz CPU it becomes 1,000,000 / 2,500,000 = 0.4.
      40              :  *
      41              :  * To be able to use integer math we work around the lack of precision. We
      42              :  * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
      43              :  * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
      44              :  * the same amount.
      45              :  *
      46              :  * We remember the maximum number of ticks that can be multiplied by the scale
      47              :  * factor without overflowing so we can check via a * b > max <=> a > max / b.
      48              :  *
      49              :  * However, as this is meant for interval measurements, it is unlikely that the
      50              :  * overflow path is actually taken in typical scenarios, since overflows would
      51              :  * only occur for intervals longer than 6.5 days.
      52              :  *
      53              :  * Note we utilize unsigned integers even though ticks are stored as a signed
      54              :  * value to encourage compilers to generate better assembly, since we can be
      55              :  * sure these values are not negative.
      56              :  *
      57              :  * In all other cases we are using clock_gettime(), which uses nanoseconds
      58              :  * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
      59              :  * to return the original value.
      60              :  */
      61              : uint64      ticks_per_ns_scaled = 0;
      62              : uint64      max_ticks_no_overflow = 0;
      63              : bool        timing_initialized = false;
      64              : int         timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
      65              : 
      66              : bool        timing_tsc_enabled = false;
      67              : int32       timing_tsc_frequency_khz = -1;
      68              : 
      69              : static void set_ticks_per_ns(void);
      70              : static void set_ticks_per_ns_system(void);
      71              : 
      72              : #if PG_INSTR_TSC_CLOCK
      73              : static TscClockSourceInfo tsc_info = {.calibrated_frequency_khz = -1};
      74              : 
      75              : static bool tsc_use_by_default(void);
      76              : static void set_ticks_per_ns_for_tsc(void);
      77              : #endif
      78              : 
      79              : /*
      80              :  * Initializes timing infrastructure. Must be called before making any use
      81              :  * of INSTR* macros.
      82              :  */
      83              : void
      84        37129 : pg_initialize_timing(void)
      85              : {
      86        37129 :     if (timing_initialized)
      87        24910 :         return;
      88              : 
      89        12219 :     set_ticks_per_ns_system();
      90        12219 :     timing_initialized = true;
      91              : }
      92              : 
      93              : bool
      94         3882 : pg_set_timing_clock_source(TimingClockSourceType source)
      95              : {
      96              :     Assert(timing_initialized);
      97              : 
      98              : #if PG_INSTR_TSC_CLOCK
      99         3882 :     pg_initialize_timing_tsc();
     100              : 
     101         3882 :     switch (source)
     102              :     {
     103         2586 :         case TIMING_CLOCK_SOURCE_AUTO:
     104         2586 :             timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
     105         2586 :             break;
     106         1294 :         case TIMING_CLOCK_SOURCE_SYSTEM:
     107         1294 :             timing_tsc_enabled = false;
     108         1294 :             break;
     109            2 :         case TIMING_CLOCK_SOURCE_TSC:
     110              :             /* Tell caller TSC is not usable */
     111            2 :             if (timing_tsc_frequency_khz <= 0)
     112            0 :                 return false;
     113            2 :             timing_tsc_enabled = true;
     114            2 :             break;
     115              :     }
     116              : #endif
     117              : 
     118         3882 :     set_ticks_per_ns();
     119         3882 :     timing_clock_source = source;
     120         3882 :     return true;
     121              : }
     122              : 
     123              : static void
     124         3882 : set_ticks_per_ns(void)
     125              : {
     126              : #if PG_INSTR_TSC_CLOCK
     127         3882 :     if (timing_tsc_enabled)
     128              :     {
     129         1294 :         set_ticks_per_ns_for_tsc();
     130         1294 :         return;
     131              :     }
     132              : #endif
     133         2588 :     set_ticks_per_ns_system();
     134              : }
     135              : 
     136              : #ifndef WIN32
     137              : 
     138              : static void
     139        14807 : set_ticks_per_ns_system(void)
     140              : {
     141        14807 :     ticks_per_ns_scaled = 0;
     142        14807 :     max_ticks_no_overflow = 0;
     143        14807 : }
     144              : 
     145              : #else                           /* WIN32 */
     146              : 
     147              : /* GetTimerFrequency returns counts per second */
     148              : static inline double
     149              : GetTimerFrequency(void)
     150              : {
     151              :     LARGE_INTEGER f;
     152              : 
     153              :     QueryPerformanceFrequency(&f);
     154              :     return (double) f.QuadPart;
     155              : }
     156              : 
     157              : static void
     158              : set_ticks_per_ns_system(void)
     159              : {
     160              :     ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
     161              :     max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
     162              : }
     163              : 
     164              : #endif                          /* WIN32 */
     165              : 
     166              : /* TSC specific logic */
     167              : 
     168              : #if PG_INSTR_TSC_CLOCK
     169              : 
     170              : static void tsc_detect_frequency(void);
     171              : static uint32 pg_tsc_calibrate_frequency(void);
     172              : 
     173              : /*
     174              :  * Initialize the TSC clock source by determining its usability and frequency.
     175              :  *
     176              :  * This can be called multiple times without causing repeated work, as
     177              :  * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
     178              :  * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
     179              :  * set by restore_backend_variables.
     180              :  */
     181              : void
     182         5174 : pg_initialize_timing_tsc(void)
     183              : {
     184         5174 :     if (timing_tsc_frequency_khz < 0)
     185         1293 :         tsc_detect_frequency();
     186         5174 : }
     187              : 
     188              : static void
     189         1294 : set_ticks_per_ns_for_tsc(void)
     190              : {
     191         1294 :     ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
     192         1294 :     max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
     193         1294 : }
     194              : 
     195              : /*
     196              :  * Detect the TSC frequency and whether RDTSCP is available on x86-64.
     197              :  *
     198              :  * This can't be reliably determined at compile time, since the
     199              :  * availability of an "invariant" TSC (that is not affected by CPU
     200              :  * frequency changes) is dependent on the CPU architecture. Additionally,
     201              :  * there are cases where TSC availability is impacted by virtualization,
     202              :  * where a simple cpuid feature check would not be enough.
     203              :  */
     204              : static void
     205         1293 : tsc_detect_frequency(void)
     206              : {
     207         1293 :     timing_tsc_frequency_khz = 0;
     208         1293 :     tsc_info.frequency_khz = 0;
     209         1293 :     tsc_info.frequency_source[0] = '\0';
     210              : 
     211         1293 :     strlcat(tsc_info.frequency_source, "x86",
     212              :             sizeof(tsc_info.frequency_source));
     213              : 
     214              :     /* We require RDTSCP support and an invariant TSC, bail if not available */
     215         1293 :     if (!x86_feature_available(PG_RDTSCP))
     216              :     {
     217            0 :         strlcat(tsc_info.frequency_source, ", no rdtscp",
     218              :                 sizeof(tsc_info.frequency_source));
     219            0 :         return;
     220              :     }
     221              : 
     222         1293 :     if (!x86_feature_available(PG_TSC_INVARIANT))
     223              :     {
     224            0 :         strlcat(tsc_info.frequency_source, ", not invariant",
     225              :                 sizeof(tsc_info.frequency_source));
     226            0 :         return;
     227              :     }
     228              : 
     229              :     /* Determine speed at which the TSC advances */
     230         1293 :     timing_tsc_frequency_khz = x86_tsc_frequency_khz(tsc_info.frequency_source,
     231              :                                                      sizeof(tsc_info.frequency_source));
     232         1293 :     if (timing_tsc_frequency_khz > 0)
     233              :     {
     234            0 :         tsc_info.frequency_khz = timing_tsc_frequency_khz;
     235            0 :         return;
     236              :     }
     237              : 
     238              :     /*
     239              :      * CPUID did not give us the TSC frequency. We can instead measure the
     240              :      * frequency by comparing ticks against walltime in a calibration loop.
     241              :      */
     242         1293 :     if (tsc_info.calibrated_frequency_khz < 0)
     243         1293 :         tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
     244              : 
     245         1293 :     timing_tsc_frequency_khz = tsc_info.calibrated_frequency_khz;
     246         1293 :     if (timing_tsc_frequency_khz > 0)
     247              :     {
     248         1292 :         strlcat(tsc_info.frequency_source, ", calibration",
     249              :                 sizeof(tsc_info.frequency_source));
     250         1292 :         tsc_info.frequency_khz = timing_tsc_frequency_khz;
     251              :     }
     252              : }
     253              : 
     254              : /*
     255              :  * Decides whether to use the TSC clock source if the user did not specify it
     256              :  * one way or the other, and it is available (checked separately).
     257              :  *
     258              :  * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
     259              :  * in 2021 to reflect the reliability of the TSC on Intel platforms, see
     260              :  * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
     261              :  * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
     262              :  * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
     263              :  * for reference.
     264              :  *
     265              :  * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
     266              :  * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
     267              :  * trustworthy by default, matching the Linux kernel.
     268              :  *
     269              :  * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
     270              :  * an easy way to determine the TSC's reliability. If on Linux, we can check if
     271              :  * TSC is the active clocksource, based on it having run the watchdog logic to
     272              :  * monitor TSC correctness. For other platforms the user must explicitly enable
     273              :  * it via GUC instead.
     274              :  */
     275              : static bool
     276         1292 : tsc_use_by_default(void)
     277              : {
     278         1292 :     if (x86_feature_available(PG_TSC_ADJUST))
     279         1292 :         return true;
     280              : 
     281              : #if defined(__linux__)
     282              :     {
     283              :         FILE       *fp;
     284              :         char        buf[128];
     285              : 
     286            0 :         fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
     287            0 :         if (fp)
     288              :         {
     289            0 :             bool        is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
     290            0 :                                   strcmp(buf, "tsc\n") == 0);
     291              : 
     292            0 :             fclose(fp);
     293            0 :             if (is_tsc)
     294            0 :                 return true;
     295              :         }
     296              :     }
     297              : #endif
     298              : 
     299            0 :     return false;
     300              : }
     301              : 
     302              : /*
     303              :  * Calibrate the TSC frequency by comparing TSC ticks against walltime.
     304              :  *
     305              :  * Takes initial TSC and system clock snapshots, then loops, recomputing the
     306              :  * frequency every TSC_CALIBRATION_SKIPS iterations from cumulative TSC
     307              :  * ticks divided by elapsed time.
     308              :  *
     309              :  * Once the frequency estimate stabilizes (successive estimates agree
     310              :  * TSC_CALIBRATION_STABLE_CYCLES times in a row), it is considered converged
     311              :  * and returned in kHz. If the iteration or time limit is hit first, return 0.
     312              :  */
     313              : #define TSC_CALIBRATION_MAX_NS      (50 * NS_PER_MS)
     314              : #define TSC_CALIBRATION_ITERATIONS  1000000
     315              : #define TSC_CALIBRATION_SKIPS       100
     316              : #define TSC_CALIBRATION_STABLE_CYCLES   10
     317              : static uint32
     318         1293 : pg_tsc_calibrate_frequency(void)
     319              : {
     320              :     instr_time  initial_wall;
     321              :     int64       initial_tsc;
     322         1293 :     double      freq_khz = 0;
     323         1293 :     double      prev_freq_khz = 0;
     324         1293 :     int         stable_count = 0;
     325              :     int64       prev_tsc;
     326         1293 :     int         saved_clock_source = timing_clock_source;
     327              : 
     328              :     /*
     329              :      * Frequency must be initialized to avoid recursion via
     330              :      * pg_set_timing_clock_source.
     331              :      */
     332              :     Assert(timing_tsc_frequency_khz >= 0);
     333              : 
     334              :     /* Ensure INSTR_* calls below work on system time */
     335         1293 :     pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
     336              : 
     337         1293 :     INSTR_TIME_SET_CURRENT(initial_wall);
     338              : 
     339         1293 :     initial_tsc = pg_rdtscp();
     340         1293 :     prev_tsc = initial_tsc;
     341              : 
     342      2759537 :     for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
     343              :     {
     344              :         instr_time  now_wall;
     345              :         int64       now_tsc;
     346              :         int64       elapsed_ns;
     347              :         int64       elapsed_ticks;
     348              : 
     349      2759537 :         INSTR_TIME_SET_CURRENT(now_wall);
     350              : 
     351      2759537 :         now_tsc = pg_rdtscp();
     352              : 
     353      2759537 :         INSTR_TIME_SUBTRACT(now_wall, initial_wall);
     354      2759537 :         elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
     355              : 
     356              :         /* Safety: bail out if we've taken too long */
     357      2759537 :         if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
     358         1293 :             break;
     359              : 
     360      2759536 :         elapsed_ticks = now_tsc - initial_tsc;
     361              : 
     362              :         /*
     363              :          * Skip if TSC hasn't advanced, or we walked backwards for some
     364              :          * reason.
     365              :          */
     366      2759536 :         if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
     367      2730661 :             continue;
     368              : 
     369              :         /*
     370              :          * We only measure frequency every TSC_CALIBRATION_SKIPS iterations to
     371              :          * avoid stabilizing based on just a handful of RDTSCP instructions.
     372              :          */
     373      2759536 :         if (i % TSC_CALIBRATION_SKIPS != 0)
     374      2730661 :             continue;
     375              : 
     376        28875 :         freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
     377              : 
     378              :         /*
     379              :          * Once freq_khz is within 0.01% of prev_freq_khz, check if it stays that
     380              :          * way. If it does for long enough, we've got a winner frequency.
     381              :          */
     382        28875 :         if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
     383              :         {
     384        15778 :             stable_count++;
     385        15778 :             if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
     386         1292 :                 break;
     387              :         }
     388              :         else
     389        13097 :             stable_count = 0;
     390              : 
     391        27583 :         prev_tsc = now_tsc;
     392        27583 :         prev_freq_khz = freq_khz;
     393              :     }
     394              : 
     395              :     /* Restore the previous clock source */
     396         1293 :     pg_set_timing_clock_source(saved_clock_source);
     397              : 
     398         1293 :     if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
     399            1 :         return 0;               /* did not converge */
     400              : 
     401         1292 :     return (uint32) freq_khz;
     402              : }
     403              : 
     404              : /*
     405              :  * Returns TSC clock source information for diagnostic purposes.
     406              :  *
     407              :  * On first call, may run the TSC calibration loop (if not already done during
     408              :  * frequency detection) which can take up to TSC_CALIBRATION_MAX_NS.
     409              :  * Subsequent calls return cached results.
     410              :  *
     411              :  * Note: This won't return the right info in EXEC_BACKEND builds if this were
     412              :  * used in the backend (which it currently is not), as tsc_info is not copied
     413              :  * using read_backend_variables - only the TSC frequency is.
     414              :  */
     415              : const TscClockSourceInfo *
     416            1 : pg_timing_tsc_clock_source_info(void)
     417              : {
     418            1 :     if (tsc_info.frequency_khz > 0 && tsc_info.calibrated_frequency_khz < 0)
     419            0 :         tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
     420              : 
     421            1 :     return &tsc_info;
     422              : }
     423              : 
     424              : #endif                          /* PG_INSTR_TSC_CLOCK */
        

Generated by: LCOV version 2.0-1