Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * instr_time.c
4 : * Non-inline parts of the portable high-precision interval timing
5 : * implementation
6 : *
7 : * Portions Copyright (c) 2026, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/common/instr_time.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef FRONTEND
16 : #include "postgres.h"
17 : #else
18 : #include "postgres_fe.h"
19 : #endif
20 :
21 : #include <math.h>
22 :
23 : #include "port/pg_cpu.h"
24 : #include "portability/instr_time.h"
25 :
26 : /*
27 : * Stores what the number of ticks needs to be multiplied with to end up
28 : * with nanoseconds using integer math.
29 : *
30 : * In certain cases (TSC on x86-64, and QueryPerformanceCounter on Windows)
31 : * the ticks to nanoseconds conversion requires floating point math because:
32 : *
33 : * sec = ticks / frequency_hz
34 : * ns = ticks / frequency_hz * 1,000,000,000
35 : * ns = ticks * (1,000,000,000 / frequency_hz)
36 : * ns = ticks * (1,000,000 / frequency_khz) <-- now in kilohertz
37 : *
38 : * Here, the scaling factor (1,000,000 / frequency_khz) is usually not an
39 : * integer. For example, for a 2.5 GHz CPU it becomes 1,000,000 / 2,500,000 = 0.4.
40 : *
41 : * To be able to use integer math we work around the lack of precision. We
42 : * first scale the integer up (left shift by TICKS_TO_NS_SHIFT) and after the
43 : * multiplication by the number of ticks in pg_ticks_to_ns() we shift right by
44 : * the same amount.
45 : *
46 : * We remember the maximum number of ticks that can be multiplied by the scale
47 : * factor without overflowing so we can check via a * b > max <=> a > max / b.
48 : *
49 : * However, as this is meant for interval measurements, it is unlikely that the
50 : * overflow path is actually taken in typical scenarios, since overflows would
51 : * only occur for intervals longer than 6.5 days.
52 : *
53 : * Note we utilize unsigned integers even though ticks are stored as a signed
54 : * value to encourage compilers to generate better assembly, since we can be
55 : * sure these values are not negative.
56 : *
57 : * In all other cases we are using clock_gettime(), which uses nanoseconds
58 : * as ticks. Hence, we set the multiplier to zero, which causes pg_ticks_to_ns
59 : * to return the original value.
60 : */
61 : uint64 ticks_per_ns_scaled = 0;
62 : uint64 max_ticks_no_overflow = 0;
63 : bool timing_initialized = false;
64 : int timing_clock_source = TIMING_CLOCK_SOURCE_AUTO;
65 :
66 : bool timing_tsc_enabled = false;
67 : int32 timing_tsc_frequency_khz = -1;
68 :
69 : static void set_ticks_per_ns(void);
70 : static void set_ticks_per_ns_system(void);
71 :
72 : #if PG_INSTR_TSC_CLOCK
73 : static TscClockSourceInfo tsc_info = {.calibrated_frequency_khz = -1};
74 :
75 : static bool tsc_use_by_default(void);
76 : static void set_ticks_per_ns_for_tsc(void);
77 : #endif
78 :
79 : /*
80 : * Initializes timing infrastructure. Must be called before making any use
81 : * of INSTR* macros.
82 : */
83 : void
84 37129 : pg_initialize_timing(void)
85 : {
86 37129 : if (timing_initialized)
87 24910 : return;
88 :
89 12219 : set_ticks_per_ns_system();
90 12219 : timing_initialized = true;
91 : }
92 :
93 : bool
94 3882 : pg_set_timing_clock_source(TimingClockSourceType source)
95 : {
96 : Assert(timing_initialized);
97 :
98 : #if PG_INSTR_TSC_CLOCK
99 3882 : pg_initialize_timing_tsc();
100 :
101 3882 : switch (source)
102 : {
103 2586 : case TIMING_CLOCK_SOURCE_AUTO:
104 2586 : timing_tsc_enabled = (timing_tsc_frequency_khz > 0) && tsc_use_by_default();
105 2586 : break;
106 1294 : case TIMING_CLOCK_SOURCE_SYSTEM:
107 1294 : timing_tsc_enabled = false;
108 1294 : break;
109 2 : case TIMING_CLOCK_SOURCE_TSC:
110 : /* Tell caller TSC is not usable */
111 2 : if (timing_tsc_frequency_khz <= 0)
112 0 : return false;
113 2 : timing_tsc_enabled = true;
114 2 : break;
115 : }
116 : #endif
117 :
118 3882 : set_ticks_per_ns();
119 3882 : timing_clock_source = source;
120 3882 : return true;
121 : }
122 :
/*
 * Install the ticks-to-nanoseconds scaling that matches the active clock
 * source: the TSC variant when it is compiled in and enabled, the system
 * clock variant otherwise.
 */
static void
set_ticks_per_ns(void)
{
#if PG_INSTR_TSC_CLOCK
	if (timing_tsc_enabled)
		set_ticks_per_ns_for_tsc();
	else
		set_ticks_per_ns_system();
#else
	set_ticks_per_ns_system();
#endif
}
135 :
136 : #ifndef WIN32
137 :
138 : static void
139 14807 : set_ticks_per_ns_system(void)
140 : {
141 14807 : ticks_per_ns_scaled = 0;
142 14807 : max_ticks_no_overflow = 0;
143 14807 : }
144 :
145 : #else /* WIN32 */
146 :
147 : /* GetTimerFrequency returns counts per second */
148 : static inline double
149 : GetTimerFrequency(void)
150 : {
151 : LARGE_INTEGER f;
152 :
153 : QueryPerformanceFrequency(&f);
154 : return (double) f.QuadPart;
155 : }
156 :
157 : static void
158 : set_ticks_per_ns_system(void)
159 : {
160 : ticks_per_ns_scaled = (NS_PER_S << TICKS_TO_NS_SHIFT) / GetTimerFrequency();
161 : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
162 : }
163 :
164 : #endif /* WIN32 */
165 :
166 : /* TSC specific logic */
167 :
168 : #if PG_INSTR_TSC_CLOCK
169 :
170 : static void tsc_detect_frequency(void);
171 : static uint32 pg_tsc_calibrate_frequency(void);
172 :
173 : /*
174 : * Initialize the TSC clock source by determining its usability and frequency.
175 : *
176 : * This can be called multiple times without causing repeated work, as
177 : * timing_tsc_frequency_khz will be set to 0 if a prior call determined the
178 : * TSC is not usable. On EXEC_BACKEND (Windows), the TSC frequency may also be
179 : * set by restore_backend_variables.
180 : */
181 : void
182 5174 : pg_initialize_timing_tsc(void)
183 : {
184 5174 : if (timing_tsc_frequency_khz < 0)
185 1293 : tsc_detect_frequency();
186 5174 : }
187 :
188 : static void
189 1294 : set_ticks_per_ns_for_tsc(void)
190 : {
191 1294 : ticks_per_ns_scaled = ((NS_PER_S / 1000) << TICKS_TO_NS_SHIFT) / timing_tsc_frequency_khz;
192 1294 : max_ticks_no_overflow = PG_INT64_MAX / ticks_per_ns_scaled;
193 1294 : }
194 :
195 : /*
196 : * Detect the TSC frequency and whether RDTSCP is available on x86-64.
197 : *
198 : * This can't be reliably determined at compile time, since the
199 : * availability of an "invariant" TSC (that is not affected by CPU
200 : * frequency changes) is dependent on the CPU architecture. Additionally,
201 : * there are cases where TSC availability is impacted by virtualization,
202 : * where a simple cpuid feature check would not be enough.
203 : */
204 : static void
205 1293 : tsc_detect_frequency(void)
206 : {
207 1293 : timing_tsc_frequency_khz = 0;
208 1293 : tsc_info.frequency_khz = 0;
209 1293 : tsc_info.frequency_source[0] = '\0';
210 :
211 1293 : strlcat(tsc_info.frequency_source, "x86",
212 : sizeof(tsc_info.frequency_source));
213 :
214 : /* We require RDTSCP support and an invariant TSC, bail if not available */
215 1293 : if (!x86_feature_available(PG_RDTSCP))
216 : {
217 0 : strlcat(tsc_info.frequency_source, ", no rdtscp",
218 : sizeof(tsc_info.frequency_source));
219 0 : return;
220 : }
221 :
222 1293 : if (!x86_feature_available(PG_TSC_INVARIANT))
223 : {
224 0 : strlcat(tsc_info.frequency_source, ", not invariant",
225 : sizeof(tsc_info.frequency_source));
226 0 : return;
227 : }
228 :
229 : /* Determine speed at which the TSC advances */
230 1293 : timing_tsc_frequency_khz = x86_tsc_frequency_khz(tsc_info.frequency_source,
231 : sizeof(tsc_info.frequency_source));
232 1293 : if (timing_tsc_frequency_khz > 0)
233 : {
234 0 : tsc_info.frequency_khz = timing_tsc_frequency_khz;
235 0 : return;
236 : }
237 :
238 : /*
239 : * CPUID did not give us the TSC frequency. We can instead measure the
240 : * frequency by comparing ticks against walltime in a calibration loop.
241 : */
242 1293 : if (tsc_info.calibrated_frequency_khz < 0)
243 1293 : tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
244 :
245 1293 : timing_tsc_frequency_khz = tsc_info.calibrated_frequency_khz;
246 1293 : if (timing_tsc_frequency_khz > 0)
247 : {
248 1292 : strlcat(tsc_info.frequency_source, ", calibration",
249 : sizeof(tsc_info.frequency_source));
250 1292 : tsc_info.frequency_khz = timing_tsc_frequency_khz;
251 : }
252 : }
253 :
254 : /*
255 : * Decides whether to use the TSC clock source if the user did not specify it
256 : * one way or the other, and it is available (checked separately).
257 : *
258 : * Inspired by the Linux kernel's clocksource watchdog disable logic as updated
259 : * in 2021 to reflect the reliability of the TSC on Intel platforms, see
260 : * check_system_tsc_reliable() in arch/x86/kernel/tsc.c, as well as discussion
261 : * in https://lore.kernel.org/lkml/87eekfk8bd.fsf@nanos.tec.linutronix.de/
262 : * and https://lore.kernel.org/lkml/87a6pimt1f.ffs@nanos.tec.linutronix.de/
263 : * for reference.
264 : *
265 : * When tsc_detect_frequency determines the TSC is viable (invariant, etc.), and
266 : * we're on an Intel platform (determined via TSC_ADJUST), we consider the TSC
267 : * trustworthy by default, matching the Linux kernel.
268 : *
269 : * On other CPU platforms (e.g. AMD), or in some virtual machines, we don't have
270 : * an easy way to determine the TSC's reliability. If on Linux, we can check if
271 : * TSC is the active clocksource, based on it having run the watchdog logic to
272 : * monitor TSC correctness. For other platforms the user must explicitly enable
273 : * it via GUC instead.
274 : */
275 : static bool
276 1292 : tsc_use_by_default(void)
277 : {
278 1292 : if (x86_feature_available(PG_TSC_ADJUST))
279 1292 : return true;
280 :
281 : #if defined(__linux__)
282 : {
283 : FILE *fp;
284 : char buf[128];
285 :
286 0 : fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");
287 0 : if (fp)
288 : {
289 0 : bool is_tsc = (fgets(buf, sizeof(buf), fp) != NULL &&
290 0 : strcmp(buf, "tsc\n") == 0);
291 :
292 0 : fclose(fp);
293 0 : if (is_tsc)
294 0 : return true;
295 : }
296 : }
297 : #endif
298 :
299 0 : return false;
300 : }
301 :
302 : /*
303 : * Calibrate the TSC frequency by comparing TSC ticks against walltime.
304 : *
305 : * Takes initial TSC and system clock snapshots, then loops, recomputing the
306 : * frequency each TSC_CALIBRATION_SKIPS iterations from cumulative TSC
307 : * ticks divided by elapsed time.
308 : *
309 : * Once the frequency estimate stabilizes (consecutive iterations agree), we
310 : * consider it converged and the frequency in KHz is returned. If either too
311 : * many iterations or a time limit passes without convergence, 0 is returned.
312 : */
313 : #define TSC_CALIBRATION_MAX_NS (50 * NS_PER_MS)
314 : #define TSC_CALIBRATION_ITERATIONS 1000000
315 : #define TSC_CALIBRATION_SKIPS 100
316 : #define TSC_CALIBRATION_STABLE_CYCLES 10
317 : static uint32
318 1293 : pg_tsc_calibrate_frequency(void)
319 : {
320 : instr_time initial_wall;
321 : int64 initial_tsc;
322 1293 : double freq_khz = 0;
323 1293 : double prev_freq_khz = 0;
324 1293 : int stable_count = 0;
325 : int64 prev_tsc;
326 1293 : int saved_clock_source = timing_clock_source;
327 :
328 : /*
329 : * Frequency must be initialized to avoid recursion via
330 : * pg_set_timing_clock_source.
331 : */
332 : Assert(timing_tsc_frequency_khz >= 0);
333 :
334 : /* Ensure INSTR_* calls below work on system time */
335 1293 : pg_set_timing_clock_source(TIMING_CLOCK_SOURCE_SYSTEM);
336 :
337 1293 : INSTR_TIME_SET_CURRENT(initial_wall);
338 :
339 1293 : initial_tsc = pg_rdtscp();
340 1293 : prev_tsc = initial_tsc;
341 :
342 2759537 : for (int i = 0; i < TSC_CALIBRATION_ITERATIONS; i++)
343 : {
344 : instr_time now_wall;
345 : int64 now_tsc;
346 : int64 elapsed_ns;
347 : int64 elapsed_ticks;
348 :
349 2759537 : INSTR_TIME_SET_CURRENT(now_wall);
350 :
351 2759537 : now_tsc = pg_rdtscp();
352 :
353 2759537 : INSTR_TIME_SUBTRACT(now_wall, initial_wall);
354 2759537 : elapsed_ns = INSTR_TIME_GET_NANOSEC(now_wall);
355 :
356 : /* Safety: bail out if we've taken too long */
357 2759537 : if (elapsed_ns >= TSC_CALIBRATION_MAX_NS)
358 1293 : break;
359 :
360 2759536 : elapsed_ticks = now_tsc - initial_tsc;
361 :
362 : /*
363 : * Skip if TSC hasn't advanced, or we walked backwards for some
364 : * reason.
365 : */
366 2759536 : if (now_tsc == prev_tsc || elapsed_ns <= 0 || elapsed_ticks <= 0)
367 2730661 : continue;
368 :
369 : /*
370 : * We only measure frequency every TSC_CALIBRATION_SKIPS to avoid
371 : * stabilizing based on just a handful of RDTSC instructions.
372 : */
373 2759536 : if (i % TSC_CALIBRATION_SKIPS != 0)
374 2730661 : continue;
375 :
376 28875 : freq_khz = ((double) elapsed_ticks / elapsed_ns) * 1000 * 1000;
377 :
378 : /*
379 : * Once freq_khz / prev_freq_khz is small, check if it stays that way.
380 : * If it does for long enough, we've got a winner frequency.
381 : */
382 28875 : if (prev_freq_khz != 0 && fabs(1 - freq_khz / prev_freq_khz) < 0.0001)
383 : {
384 15778 : stable_count++;
385 15778 : if (stable_count >= TSC_CALIBRATION_STABLE_CYCLES)
386 1292 : break;
387 : }
388 : else
389 13097 : stable_count = 0;
390 :
391 27583 : prev_tsc = now_tsc;
392 27583 : prev_freq_khz = freq_khz;
393 : }
394 :
395 : /* Restore the previous clock source */
396 1293 : pg_set_timing_clock_source(saved_clock_source);
397 :
398 1293 : if (stable_count < TSC_CALIBRATION_STABLE_CYCLES)
399 1 : return 0; /* did not converge */
400 :
401 1292 : return (uint32) freq_khz;
402 : }
403 :
404 : /*
405 : * Returns TSC clock source information for diagnostic purposes.
406 : *
407 : * On first call, may run the TSC calibration loop (if not already done during
408 : * frequency detection) which can take up to TSC_CALIBRATION_MAX_NS.
409 : * Subsequent calls return cached results.
410 : *
411 : * Note: This won't return the right info in EXEC_BACKEND builds if this were
412 : * used in the backend (which it currently is not), as tsc_info is not copied
413 : * using read_backend_variables - only the TSC frequency is.
414 : */
415 : const TscClockSourceInfo *
416 1 : pg_timing_tsc_clock_source_info(void)
417 : {
418 1 : if (tsc_info.frequency_khz > 0 && tsc_info.calibrated_frequency_khz < 0)
419 0 : tsc_info.calibrated_frequency_khz = pg_tsc_calibrate_frequency();
420 :
421 1 : return &tsc_info;
422 : }
423 :
424 : #endif /* PG_INSTR_TSC_CLOCK */
|