LCOV - code coverage report
Current view: top level - src/common - instr_time.c (source / functions) Coverage Total Hit
Test: PostgreSQL 19devel Lines: 86.8 % 91 79
Test Date: 2026-05-01 00:16:35 Functions: 100.0 % 9 9
Legend: Lines:     hit not hit

            Line data    Source code
       1              : /*-------------------------------------------------------------------------
       2              :  *
       3              :  * instr_time.c
       4              :  *     Non-inline parts of the portable high-precision interval timing
       5              :  *   implementation
       6              :  *
       7              :  * Portions Copyright (c) 2026, PostgreSQL Global Development Group
       8              :  *
       9              :  *
      10              :  * IDENTIFICATION
      11              :  *    src/common/instr_time.c
      12              :  *
      13              :  *-------------------------------------------------------------------------
      14              :  */
      15              : #ifndef FRONTEND
      16              : #include "postgres.h"
      17              : #else
      18              : #include "postgres_fe.h"
      19              : #endif
      20              : 
      21              : #include <math.h>
      22              : 
      23              : #include "port/pg_cpu.h"
      24              : #include "portability/instr_time.h"
      25              : 
      26              : /*
      27              :  * Stores what the number of ticks needs to be multiplied with to end up
      28              :  * with nanoseconds using integer math.
      29              :  *
      30              :  * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
      31              :  * the ticks to nanoseconds conversion requires floating point math because:
      32              :  *
      33              :  * sec = ticks / frequency_hz
      34              :  * ns  = ticks / frequency_hz * 1,000,000,000
      35              :  * ns  = ticks * (1,000,000,000 / frequency_hz)
      36              :  * ns  = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
      37              :  *
      38              :  * Here, the scaling factor is usually not an integer. For example, for a
      39              :  * 2.5 GHz CPU it becomes 1,000,000 / 2,500,000 = 0.4.
      40              :  *
      41              :  * To be able to use integer math we work around the lack of precision. We
      42              :  * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
      43              :  * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
      44              :  * the same amount.
      45              :  *
      46              :  * We remember the maximum number of ticks that can be multiplied by the scale
      47              :  * factor without overflowing so we can check via a * b > max <=> a > max / b.
      48              :  *
      49              :  * However, as this is meant for interval measurements, it is unlikely that the
      50              :  * overflow path is actually taken in typical scenarios, since overflows would
      51              :  * only occur for intervals longer than 6.5 days.
      52              :  *
      53              :  * Note we utilize unsigned integers even though ticks are stored as a signed
      54              :  * value to encourage compilers to generate better assembly, since we can be
      55              :  * sure these values are not negative.
      56              :  *
      57              :  * In all other cases we are using clock_gettime(), which uses nanoseconds
      58              :  * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
      59              :  * to return the original value.
      60              :  */
      61              : uint64      ticks_per_ns_scaled = 0;    /* scaled tick-to-ns multiplier; 0 = ticks already are nanoseconds */
      62              : uint64      max_ticks_no_overflow = 0;  /* largest tick count that can be scaled without overflowing */
      63              : bool        timing_initialized = false; /* set to true by pg_initialize_timing() */
      64              : int         timing_clock_source = TIMING_CLOCK_SOURCE_AUTO; /* last source accepted by pg_set_timing_clock_source() */
      65              : 
      66              : bool        timing_tsc_enabled = false; /* true when pg_set_timing_clock_source() selected the TSC */
      67              : int32       timing_tsc_frequency_khz = -1;  /* -1 = not yet detected, 0 = TSC unusable, > 0 = frequency in kHz */
      68              : 
      69              : static void set_ticks_per_ns(void);    /* picks TSC or system scaling */
      70              : static void set_ticks_per_ns_system(void); /* scaling for the system clock */
      71              : /* The helpers below exist only when the TSC clock source is compiled in. */
      72              : #if PG_INSTR_TSC_CLOCK
      73              : static bool tsc_use_by_default(void);  /* policy for TIMING_CLOCK_SOURCE_AUTO */
      74              : static void set_ticks_per_ns_for_tsc(void);    /* scaling for the TSC */
      75              : #endif
      76              : 
      77              : /*
      78              :  * Initializes timing infrastructure, starting out on the system clock. Must
      79              :  * be called before making any use of INSTR* macros; repeated calls are no-ops.
      80              :  */
      81              : void
      82        36570 : pg_initialize_timing(void)
      83              : {
      84        36570 :     if (timing_initialized)    /* already set up by an earlier call */
      85        24508 :         return;
      86              : 
      87        12062 :     set_ticks_per_ns_system(); /* system-clock scaling is the initial state */
      88        12062 :     timing_initialized = true;
      89              : }
      90              : /* Select the clock source used for timing. Returns false, without switching, only when the TSC is requested but not usable; true otherwise. */
      91              : bool
      92         3848 : pg_set_timing_clock_source(TimingClockSourceType source)
      93              : {
      94              :     Assert(timing_initialized);     /* pg_initialize_timing() must have run first */
      95              : 
      96              : #if PG_INSTR_TSC_CLOCK
      97         3848 :     pg_initialize_timing_tsc();     /* detects the TSC frequency if not done yet */
      98              : 
      99         3848 :     switch (source)
     100              :     {
     101         2562 :         case TIMING_CLOCK_SOURCE_AUTO:
     102         2562 :             timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();    /* usable and trusted by default */
     103         2562 :             break;
     104         1283 :         case TIMING_CLOCK_SOURCE_SYSTEM:
     105         1283 :             timing_tsc_enabled = false;
     106         1283 :             break;
     107            3 :         case TIMING_CLOCK_SOURCE_TSC:
     108              :             /* Tell caller TSC is not usable; nothing has been switched yet */
     109            3 :             if (timing_tsc_frequency_khz <= 0)
     110            0 :                 return false;
     111            3 :             timing_tsc_enabled = true;
     112            3 :             break;
     113              :     }
     114              : #endif
     115              : 
     116         3848 :     set_ticks_per_ns();     /* recompute the scaling for the chosen source */
     117         3848 :     timing_clock_source = source;   /* remember what was requested */
     118         3848 :     return true;
     119              : }
     120              : /* Recompute ticks_per_ns_scaled and max_ticks_no_overflow for the currently selected tick source. */
     121              : static void
     122         3848 : set_ticks_per_ns(void)
     123              : {
     124              : #if PG_INSTR_TSC_CLOCK
     125         3848 :     if (timing_tsc_enabled)    /* TSC selected: scale by its frequency */
     126              :     {
     127         1284 :         set_ticks_per_ns_for_tsc();
     128         1284 :         return;
     129              :     }
     130              : #endif
     131         2564 :     set_ticks_per_ns_system(); /* otherwise use the system clock's scaling */
     132              : }
     133              : 
     134              : #ifndef WIN32
     135              : /* Non-Windows: system clock ticks are already nanoseconds, so the multiplier (and the overflow limit) is zero. */
     136              : static void
     137        14626 : set_ticks_per_ns_system(void)
     138              : {
     139        14626 :     ticks_per_ns_scaled = 0;   /* zero multiplier: ticks are used as-is */
     140        14626 :     max_ticks_no_overflow = 0;
     141        14626 : }
     142              : 
     143              : #else                           /* WIN32 */
     144              : 
     145              : /* GetTimerFrequency returns the performance counter frequency in counts per second, as a double */
     146              : static inline double
     147              : GetTimerFrequency(void)
     148              : {
     149              :     LARGE_INTEGER f;
     150              : 
     151              :     QueryPerformanceFrequency(&f);  /* return value is not checked */
     152              :     return (double) f.QuadPart;
     153              : }
     154              : /* Windows: derive the scaled tick-to-ns multiplier and the overflow limit from the performance counter frequency. */
     155              : static void
     156              : set_ticks_per_ns_system(void)
     157              : {
     158              :     ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();    /* floating point division, converted to uint64 on assignment */
     159              :     max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; /* largest tick count whose product still fits in int64 */
     160              : }
     161              : 
     162              : #endif                          /* WIN32 */
     163              : 
     164              : /* TSC specific logic; everything up to the matching #endif is built only when PG_INSTR_TSC_CLOCK is enabled */
     165              : 
     166              : #if PG_INSTR_TSC_CLOCK
     167              : 
     168              : static void tsc_detect_frequency(void);    /* fills in timing_tsc_frequency_khz */
     169              : 
     170              : /*
     171              :  * Initialize the TSC clock source by determining its usability and frequency.
     172              :  *
     173              :  * This can be called multiple times without causing repeated work: detection
     174              :  * only runs while timing_tsc_frequency_khz is still negative (its initial
     175              :  * value), and a prior call leaves it at 0 if the TSC is not usable. On
     176              :  * EXEC_BACKEND (Windows), the TSC frequency may also be set by restore_backend_variables.
     177              :  */
     178              : void
     179         5128 : pg_initialize_timing_tsc(void)
     180              : {
     181         5128 :     if (timing_tsc_frequency_khz < 0)  /* negative = detection has not run yet */
     182         1281 :         tsc_detect_frequency();
     183         5128 : }
     184              : /* Compute the scaled tick-to-ns multiplier and overflow limit from the TSC frequency; timing_tsc_frequency_khz must be > 0 since it is used as a divisor. */
     185              : static void
     186         1284 : set_ticks_per_ns_for_tsc(void)
     187              : {
     188         1284 :     ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz; /* (1,000,000 / frequency_khz), scaled up by the shift */
     189         1284 :     max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled; /* largest tick count whose product still fits in int64 */
     190         1284 : }
     191              : 
     192              : /*
     193              :  * Detect the TSC frequency and whether RDTSCP is available on x86-64; the result is stored in timing_tsc_frequency_khz, where 0 means the TSC is not usable.
     194              :  *
     195              :  * This can't be reliably determined at compile time, since the
     196              :  * availability of an "invariant" TSC (that is not affected by CPU
     197              :  * frequency changes) is dependent on the CPU architecture. Additionally,
     198              :  * there are cases where TSC availability is impacted by virtualization,
     199              :  * where a simple cpuid feature check would not be enough.
     200              :  */
     201              : static void
     202         1281 : tsc_detect_frequency(void)
     203              : {
     204         1281 :     timing_tsc_frequency_khz = 0;  /* no longer negative, so pg_initialize_timing_tsc() will not call us again */
     205              : 
     206              :     /* We require RDTSCP support and an invariant TSC, bail if not available */
     207         1281 :     if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT))
     208            0 :         return;
     209              : 
     210              :     /* Determine speed at which the TSC advances, preferring what CPUID reports */
     211         1281 :     timing_tsc_frequency_khz = x86_tsc_frequency_khz();
     212         1281 :     if (timing_tsc_frequency_khz > 0)
     213            0 :         return;
     214              : 
     215              :     /*
     216              :      * CPUID did not give us the TSC frequency. We can instead measure the
     217              :      * frequency by comparing ticks against walltime in a calibration loop, which yields 0 if it does not converge.
     218              :      */
     219         1281 :     timing_tsc_frequency_khz = pg_tsc_calibrate_frequency();
     220              : }
     221              : 
     222              : /*
     223              :  * Decides whether to use the TSC clock source if the user did not specify it
     224              :  * one way or the other, and it is available (checked separately by the caller).
     225              :  *
     226              :  * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
     227              :  * in 2021 to reflect the reliability of the TSC on Intel platforms, see
     228              :  * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
     229              :  * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
     230              :  * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
     231              :  * for reference.
     232              :  *
     233              :  * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
     234              :  * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
     235              :  * trustworthy by default, matching the Linux kernel.
     236              :  *
     237              :  * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
     238              :  * an easy way to determine the TSC's reliability. If on Linux, we can check if
     239              :  * TSC is the active clocksource, based on it having run the watchdog logic to
     240              :  * monitor TSC correctness. For other platforms the user must explicitly enable
     241              :  * it via GUC instead, and this function returns false.
     242              :  */
     243              : static bool
     244         1281 : tsc_use_by_default(void)
     245              : {
     246         1281 :     if (x86_feature_available(PG_TSC_ADJUST))  /* taken as the "Intel platform" signal described above */
     247         1281 :         return true;
     248              : 
     249              : #if defined(__linux__)
     250              :     {
     251              :         FILE       *fp;
     252              :         char        buf[128];
     253              : 
     254            0 :         fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
     255            0 :         if (fp)    /* an unreadable file simply falls through to "false" */
     256              :         {
     257            0 :             bool        is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
     258            0 :                                   strcmp(buf, "tsc\n") == 0);  /* fgets keeps the newline, so compare with it included */
     259              : 
     260            0 :             fclose(fp);
     261            0 :             if (is_tsc)
     262            0 :                 return true;
     263              :         }
     264              :     }
     265              : #endif
     266              : 
     267            0 :     return false;
     268              : }
     269              : 
     270              : /*
     271              :  * Calibrate the TSC frequency by comparing TSC ticks against walltime.
     272              :  *
     273              :  * Takes initial TSC and system clock snapshots, then loops, recomputing the
     274              :  * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC
     275              :  * ticks divided by elapsed time.
     276              :  *
     277              :  * Once the frequency estimate stabilizes (TSC_CALIBRATION_STABLE_CYCLES consecutive measurements agree to within 0.01%), we
     278              :  * consider it converged and the frequency in KHz is returned. If either too
     279              :  * many iterations or a time limit passes without convergence, 0 is returned.
     280              :  */
     281              : #define TSC_CALIBRATION_MAX_NS      (50 * NS_PER_MS)   /* time limit for the whole calibration */
     282              : #define TSC_CALIBRATION_ITERATIONS  1000000    /* upper bound on loop iterations */
     283              : #define TSC_CALIBRATION_SKIPS       100    /* take a measurement only when the loop index is a multiple of this */
     284              : #define TSC_CALIBRATION_STABLE_CYCLES   10 /* consecutive agreeing measurements required */
     285              : uint32
     286         1282 : pg_tsc_calibrate_frequency(void)
     287              : {
     288              :     instr_time  initial_wall;
     289              :     int64       initial_tsc;
     290         1282 :     double      freq_khz = 0;
     291         1282 :     double      prev_freq_khz = 0;
     292         1282 :     int         stable_count = 0;
     293              :     int64       prev_tsc;
     294         1282 :     int         saved_clock_source = timing_clock_source;
     295              : 
     296              :     /*
     297              :      * Frequency must be initialized (non-negative) to avoid recursion via
     298              :      * pg_set_timing_clock_source, which would otherwise start TSC detection again.
     299              :      */
     300              :     Assert(timing_tsc_frequency_khz >= 0);
     301              : 
     302              :     /* Ensure INSTR_* calls below work on system time */
     303         1282 :     pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
     304              : 
     305         1282 :     INSTR_TIME_SET_CURRENT(initial_wall);
     306              : 
     307         1282 :     initial_tsc = pg_rdtscp();
     308         1282 :     prev_tsc = initial_tsc;
     309              : 
     310      2795682 :     for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
     311              :     {
     312              :         instr_time  now_wall;
     313              :         int64       now_tsc;
     314              :         int64       elapsed_ns;
     315              :         int64       elapsed_ticks;
     316              : 
     317      2795682 :         INSTR_TIME_SET_CURRENT(now_wall);
     318              : 
     319      2795682 :         now_tsc = pg_rdtscp();
     320              : 
     321      2795682 :         INSTR_TIME_SUBTRACT(now_wall, initial_wall);   /* now_wall becomes the time elapsed since the start */
     322      2795682 :         elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
     323              : 
     324              :         /* Safety: bail out if we've taken too long */
     325      2795682 :         if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
     326         1282 :             break;
     327              : 
     328      2795682 :         elapsed_ticks = now_tsc - initial_tsc; /* cumulative since the start, not since the last measurement */
     329              : 
     330              :         /*
     331              :          * Skip if TSC hasn't advanced since the last measurement, or we walked backwards for some
     332              :          * reason.
     333              :          */
     334      2795682 :         if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
     335      2766456 :             continue;
     336              : 
     337              :         /*
     338              :          * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
     339              :          * stabilizing based on just a handful of RDTSC instructions.
     340              :          */
     341      2795682 :         if (i % TSC_CALIBRATION_SKIPS != 0)
     342      2766456 :             continue;
     343              : 
     344        29226 :         freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;    /* ticks per ns, times 1,000,000, is kHz */
     345              : 
     346              :         /*
     347              :          * Once freq_khz differs from prev_freq_khz by less than 0.01%, check if it stays that way.
     348              :          * If it does for long enough, we've got a winner frequency.
     349              :          */
     350        29226 :         if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
     351              :         {
     352        15831 :             stable_count++;
     353        15831 :             if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
     354         1282 :                 break;
     355              :         }
     356              :         else
     357        13395 :             stable_count = 0;   /* agreement must be consecutive, so start over */
     358              : 
     359        27944 :         prev_tsc = now_tsc;
     360        27944 :         prev_freq_khz = freq_khz;
     361              :     }
     362              : 
     363              :     /* Restore the previous clock source; the result of this call is not checked */
     364         1282 :     pg_set_timing_clock_source(saved_clock_source);
     365              : 
     366         1282 :     if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
     367            0 :         return 0;               /* did not converge */
     368              : 
     369         1282 :     return (uint32) freq_khz;   /* fractional kHz is truncated */
     370              : }
     371              : 
     372              : #endif                          /* PG_INSTR_TSC_CLOCK */
        

Generated by: LCOV version 2.0-1