Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * instr_time.c
4 : * Non-inline parts of the portable high-precision interval timing
5 : * implementation
6 : *
7 : * Portions Copyright (c) 2026, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/common/instr_time.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef FRONTEND
16 : #include "postgres.h"
17 : #else
18 : #include "postgres_fe.h"
19 : #endif
20 :
21 : #include <math.h>
22 :
23 : #include "port/pg_cpu.h"
24 : #include "portability/instr_time.h"
25 :
26 : /*
27 : * Stores what the number of ticks needs to be multiplied with to end up
28 : * with nanoseconds using integer math.
29 : *
30 : * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
31 : * the ticks to nanoseconds conversion requires floating point math because:
32 : *
33 : * sec = ticks / frequency_hz
34 : * ns = ticks / frequency_hz * 1,000,000,000
35 : * ns = ticks * (1,000,000,000 / frequency_hz)
36 : * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
37 : *
38 : * Here, 'ns' is usually a floating point number. For example for a 2.5 GHz CPU
39 : * the scaling factor becomes 1,000,000 / 2,500,000 = 0.4.
40 : *
41 : * To be able to use integer math we work around the lack of precision. We
42 : * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
43 : * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
44 : * the same amount.
45 : *
46 : * We remember the maximum number of ticks that can be multiplied by the scale
47 : * factor without overflowing so we can check via a * b > max <=> a > max / b.
48 : *
49 : * However, as this is meant for interval measurements, it is unlikely that the
50 : * overflow path is actually taken in typical scenarios, since overflows would
51 : * only occur for intervals longer than 6.5 days.
52 : *
53 : * Note we utilize unsigned integers even though ticks are stored as a signed
54 : * value to encourage compilers to generate better assembly, since we can be
55 : * sure these values are not negative.
56 : *
57 : * In all other cases we are using clock_gettime(), which uses nanoseconds
58 : * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
59 : * to return the original value.
60 : */
61 : uint64 ticks_per_ns_scaled = 0;
62 : uint64 max_ticks_no_overflow = 0;
63 : bool timing_initialized = false;
64 : int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
65 :
66 : bool timing_tsc_enabled = false;
67 : int32 timing_tsc_frequency_khz = -1;
68 :
69 : static void set_ticks_per_ns(void);
70 : static void set_ticks_per_ns_system(void);
71 :
72 : #if PG_INSTR_TSC_CLOCK
73 : static bool tsc_use_by_default(void);
74 : static void set_ticks_per_ns_for_tsc(void);
75 : #endif
76 :
77 : /*
78 : * Initializes timing infrastructure. Must be called before making any use
79 : * of INSTR* macros.
80 : */
81 : void
82 36570 : pg_initialize_timing(void)
83 : {
84 36570 : if (timing_initialized)
85 24508 : return;
86 :
87 12062 : set_ticks_per_ns_system();
88 12062 : timing_initialized = true;
89 : }
90 :
91 : bool
92 3848 : pg_set_timing_clock_source(TimingClockSourceType source)
93 : {
94 : Assert(timing_initialized);
95 :
96 : #if PG_INSTR_TSC_CLOCK
97 3848 : pg_initialize_timing_tsc();
98 :
99 3848 : switch (source)
100 : {
101 2562 : case TIMING_CLOCK_SOURCE_AUTO:
102 2562 : timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
103 2562 : break;
104 1283 : case TIMING_CLOCK_SOURCE_SYSTEM:
105 1283 : timing_tsc_enabled = false;
106 1283 : break;
107 3 : case TIMING_CLOCK_SOURCE_TSC:
108 : /* Tell caller TSC is not usable */
109 3 : if (timing_tsc_frequency_khz <= 0)
110 0 : return false;
111 3 : timing_tsc_enabled = true;
112 3 : break;
113 : }
114 : #endif
115 :
116 3848 : set_ticks_per_ns();
117 3848 : timing_clock_source = source;
118 3848 : return true;
119 : }
120 :
/*
 * Install the tick-to-nanosecond scale matching the active clock source:
 * the TSC scale when the TSC is enabled, the system clock scale otherwise.
 */
static void
set_ticks_per_ns(void)
{
#if PG_INSTR_TSC_CLOCK
	if (timing_tsc_enabled)
		set_ticks_per_ns_for_tsc();
	else
#endif
		set_ticks_per_ns_system();
}
133 :
134 : #ifndef WIN32
135 :
136 : static void
137 14626 : set_ticks_per_ns_system(void)
138 : {
139 14626 : ticks_per_ns_scaled = 0;
140 14626 : max_ticks_no_overflow = 0;
141 14626 : }
142 :
143 : #else /* WIN32 */
144 :
145 : /* GetTimerFrequency returns counts per second */
146 : static inline double
147 : GetTimerFrequency(void)
148 : {
149 : LARGE_INTEGER f;
150 :
151 : QueryPerformanceFrequency(&f);
152 : return (double) f.QuadPart;
153 : }
154 :
155 : static void
156 : set_ticks_per_ns_system(void)
157 : {
158 : ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
159 : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
160 : }
161 :
162 : #endif /* WIN32 */
163 :
164 : /* TSC specific logic */
165 :
166 : #if PG_INSTR_TSC_CLOCK
167 :
168 : static void tsc_detect_frequency(void);
169 :
170 : /*
171 : * Initialize the TSC clock source by determining its usability and frequency.
172 : *
173 : * This can be called multiple times without causing repeated work, as
174 : * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
175 : * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
176 : * set by restore_backend_variables.
177 : */
178 : void
179 5128 : pg_initialize_timing_tsc(void)
180 : {
181 5128 : if (timing_tsc_frequency_khz < 0)
182 1281 : tsc_detect_frequency();
183 5128 : }
184 :
185 : static void
186 1284 : set_ticks_per_ns_for_tsc(void)
187 : {
188 1284 : ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
189 1284 : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
190 1284 : }
191 :
192 : /*
193 : * Detect the TSC frequency and whether RDTSCP is available on x86-64.
194 : *
195 : * This can't be reliably determined at compile time, since the
196 : * availability of an "invariant" TSC (that is not affected by CPU
197 : * frequency changes) is dependent on the CPU architecture. Additionally,
198 : * there are cases where TSC availability is impacted by virtualization,
199 : * where a simple cpuid feature check would not be enough.
200 : */
201 : static void
202 1281 : tsc_detect_frequency(void)
203 : {
204 1281 : timing_tsc_frequency_khz = 0;
205 :
206 : /* We require RDTSCP support and an invariant TSC, bail if not available */
207 1281 : if (!x86_feature_available(PG_RDTSCP) || !x86_feature_available(PG_TSC_INVARIANT))
208 0 : return;
209 :
210 : /* Determine speed at which the TSC advances */
211 1281 : timing_tsc_frequency_khz = x86_tsc_frequency_khz();
212 1281 : if (timing_tsc_frequency_khz > 0)
213 0 : return;
214 :
215 : /*
216 : * CPUID did not give us the TSC frequency. We can instead measure the
217 : * frequency by comparing ticks against walltime in a calibration loop.
218 : */
219 1281 : timing_tsc_frequency_khz = pg_tsc_calibrate_frequency();
220 : }
221 :
222 : /*
223 : * Decides whether to use the TSC clock source if the user did not specify it
224 : * one way or the other, and it is available (checked separately).
225 : *
226 : * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
227 : * in 2021 to reflect the reliability of the TSC on Intel platforms, see
228 : * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
229 : * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
230 : * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
231 : * for reference.
232 : *
233 : * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
234 : * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
235 : * trustworthy by default, matching the Linux kernel.
236 : *
237 : * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
238 : * an easy way to determine the TSC's reliability. If on Linux, we can check if
239 : * TSC is the active clocksource, based on it having run the watchdog logic to
240 : * monitor TSC correctness. For other platforms the user must explicitly enable
241 : * it via GUC instead.
242 : */
243 : static bool
244 1281 : tsc_use_by_default(void)
245 : {
246 1281 : if (x86_feature_available(PG_TSC_ADJUST))
247 1281 : return true;
248 :
249 : #if defined(__linux__)
250 : {
251 : FILE *fp;
252 : char buf[128];
253 :
254 0 : fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
255 0 : if (fp)
256 : {
257 0 : bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
258 0 : strcmp(buf, "tsc\n") == 0);
259 :
260 0 : fclose(fp);
261 0 : if (is_tsc)
262 0 : return true;
263 : }
264 : }
265 : #endif
266 :
267 0 : return false;
268 : }
269 :
270 : /*
271 : * Calibrate the TSC frequency by comparing TSC ticks against walltime.
272 : *
273 : * Takes initial TSC and system clock snapshots, then loops, recomputing the
274 : * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC
275 : * ticks divided by elapsed time.
276 : *
277 : * Once the frequency estimate stabilizes (consecutive iterations agree), we
278 : * consider it converged and the frequency in KHz is returned. If either too
279 : * many iterations or a time limit passes without convergence, 0 is returned.
280 : */
281 : #define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
282 : #define TSC_CALIBRATION_ITERATIONS 1000000
283 : #define TSC_CALIBRATION_SKIPS 100
284 : #define TSC_CALIBRATION_STABLE_CYCLES 10
285 : uint32
286 1282 : pg_tsc_calibrate_frequency(void)
287 : {
288 : instr_time initial_wall;
289 : int64 initial_tsc;
290 1282 : double freq_khz = 0;
291 1282 : double prev_freq_khz = 0;
292 1282 : int stable_count = 0;
293 : int64 prev_tsc;
294 1282 : int saved_clock_source = timing_clock_source;
295 :
296 : /*
297 : * Frequency must be initialized to avoid recursion via
298 : * pg_set_timing_clock_source.
299 : */
300 : Assert(timing_tsc_frequency_khz >= 0);
301 :
302 : /* Ensure INSTR_* calls below work on system time */
303 1282 : pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
304 :
305 1282 : INSTR_TIME_SET_CURRENT(initial_wall);
306 :
307 1282 : initial_tsc = pg_rdtscp();
308 1282 : prev_tsc = initial_tsc;
309 :
310 2795682 : for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
311 : {
312 : instr_time now_wall;
313 : int64 now_tsc;
314 : int64 elapsed_ns;
315 : int64 elapsed_ticks;
316 :
317 2795682 : INSTR_TIME_SET_CURRENT(now_wall);
318 :
319 2795682 : now_tsc = pg_rdtscp();
320 :
321 2795682 : INSTR_TIME_SUBTRACT(now_wall, initial_wall);
322 2795682 : elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
323 :
324 : /* Safety: bail out if we've taken too long */
325 2795682 : if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
326 1282 : break;
327 :
328 2795682 : elapsed_ticks = now_tsc - initial_tsc;
329 :
330 : /*
331 : * Skip if TSC hasn't advanced, or we walked backwards for some
332 : * reason.
333 : */
334 2795682 : if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
335 2766456 : continue;
336 :
337 : /*
338 : * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
339 : * stabilizing based on just a handful of RDTSC instructions.
340 : */
341 2795682 : if (i % TSC_CALIBRATION_SKIPS != 0)
342 2766456 : continue;
343 :
344 29226 : freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
345 :
346 : /*
347 : * Once freq_khz / prev_freq_khz is small, check if it stays that way.
348 : * If it does for long enough, we've got a winner frequency.
349 : */
350 29226 : if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
351 : {
352 15831 : stable_count++;
353 15831 : if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
354 1282 : break;
355 : }
356 : else
357 13395 : stable_count = 0;
358 :
359 27944 : prev_tsc = now_tsc;
360 27944 : prev_freq_khz = freq_khz;
361 : }
362 :
363 : /* Restore the previous clock source */
364 1282 : pg_set_timing_clock_source(saved_clock_source);
365 :
366 1282 : if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
367 0 : return 0; /* did not converge */
368 :
369 1282 : return (uint32) freq_khz;
370 : }
371 :
372 : #endif /* PG_INSTR_TSC_CLOCK */
|