/*-------------------------------------------------------------------------
 *
 * arch-x86.h
 *    Atomic operations considerations specific to Intel x86
 *
 * Note that we actually require a 486 upwards because the 386 doesn't have
 * support for xadd and cmpxchg. Given that the 386 isn't supported anywhere
 * anymore, that's luckily not much of a restriction.
 *
 * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES:
 *
 * src/include/port/atomics/arch-x86.h
 *
 *-------------------------------------------------------------------------
 */

/*
 * Both 32 and 64 bit x86 do not allow loads to be reordered with other loads,
 * or stores to be reordered with other stores, but a store may be reordered
 * with a later load; that is, a load can complete before an earlier store
 * becomes visible to other processors.
 *
 * Technically, some x86-ish chips support uncached memory access and/or
 * special instructions that are weakly ordered. In those cases we'd need
 * the read and write barriers to be lfence and sfence. But since we don't
 * do those things, a compiler barrier should be enough.
 *
 * "lock; addl" has been supported for longer than "mfence". It's also
 * rumored to be faster in many scenarios.
 */

#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#if defined(__i386__) || defined(__i386)
#define pg_memory_barrier_impl()    \
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory", "cc")
#elif defined(__x86_64__)
#define pg_memory_barrier_impl()    \
    __asm__ __volatile__ ("lock; addl $0,0(%%rsp)" : : : "memory", "cc")
#endif
#endif                          /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#define pg_read_barrier_impl()      pg_compiler_barrier_impl()
#define pg_write_barrier_impl()     pg_compiler_barrier_impl()
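
/*
 * Illustrative sketch only (kept under #if 0, not part of this header): on
 * SSE2-capable CPUs an mfence instruction would also provide a full memory
 * barrier.  It is not used above because "lock; addl" works on older CPUs as
 * well and is reportedly no slower.  The macro name below is hypothetical.
 */
#if 0
#define pg_memory_barrier_mfence_sketch() \
    __asm__ __volatile__ ("mfence" : : : "memory")
#endif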

/*
 * Provide implementation for atomics using inline assembly on x86 gcc. It's
 * nice to support older gcc versions and the compare/exchange implementation
 * here is actually more efficient than the __sync variant.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_FLAG_SUPPORT
typedef struct pg_atomic_flag
{
    volatile char value;
} pg_atomic_flag;

#define PG_HAVE_ATOMIC_U32_SUPPORT
typedef struct pg_atomic_uint32
{
    volatile uint32 value;
} pg_atomic_uint32;

/*
 * It's too complicated to write inline asm for 64bit types on 32bit and the
 * 486 can't do it anyway.
 */
#ifdef __x86_64__
#define PG_HAVE_ATOMIC_U64_SUPPORT
typedef struct pg_atomic_uint64
{
    /* alignment guaranteed due to being on a 64bit platform */
    volatile uint64 value;
} pg_atomic_uint64;
#endif                          /* __x86_64__ */

#endif                          /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

#if !defined(PG_HAVE_SPIN_DELAY)
/*
 * This sequence is equivalent to the PAUSE instruction ("rep" is
 * ignored by old IA32 processors if the following instruction is
 * not a string operation); the IA-32 Architecture Software
 * Developer's Manual, Vol. 3, Section 7.7.2 describes why using
 * PAUSE in the inner loop of a spin lock is necessary for good
 * performance:
 *
 *     The PAUSE instruction improves the performance of IA-32
 *     processors supporting Hyper-Threading Technology when
 *     executing spin-wait loops and other routines where one
 *     thread is accessing a shared lock or semaphore in a tight
 *     polling loop. When executing a spin-wait loop, the
 *     processor can suffer a severe performance penalty when
 *     exiting the loop because it detects a possible memory order
 *     violation and flushes the core processor's pipeline. The
 *     PAUSE instruction provides a hint to the processor that the
 *     code sequence is a spin-wait loop. The processor uses this
 *     hint to avoid the memory order violation and prevent the
 *     pipeline flush. In addition, the PAUSE instruction
 *     de-pipelines the spin-wait loop to prevent it from
 *     consuming execution resources excessively.
 */
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
#define PG_HAVE_SPIN_DELAY
static __inline__ void
pg_spin_delay_impl(void)
{
    __asm__ __volatile__(" rep; nop \n");
}
#elif defined(_MSC_VER) && defined(__x86_64__)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    _mm_pause();
}
#elif defined(_MSC_VER)
#define PG_HAVE_SPIN_DELAY
static __forceinline void
pg_spin_delay_impl(void)
{
    /* See comment for gcc code. Same code, MASM syntax */
    __asm rep nop;
}
#endif
#endif                          /* !defined(PG_HAVE_SPIN_DELAY) */
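
/*
 * Illustrative sketch only (under #if 0): where the PAUSE hint typically sits
 * in a spin-wait loop.  The lock variable and the acquire function below are
 * hypothetical and not part of PostgreSQL's spinlock API; real callers go
 * through s_lock.h.
 */
#if 0
static inline void
example_spin_acquire(volatile char *lck)
{
    /* busy-wait until the test-and-set succeeds */
    while (__atomic_test_and_set(lck, __ATOMIC_ACQUIRE))
        pg_spin_delay_impl();   /* hint to the CPU that we are spin-waiting */
}
#endif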

#if defined(__GNUC__) || defined(__INTEL_COMPILER)

#define PG_HAVE_ATOMIC_TEST_SET_FLAG
static inline bool
pg_atomic_test_set_flag_impl(volatile pg_atomic_flag *ptr)
{
    char        _res = 1;

    __asm__ __volatile__(
        "   lock            \n"
        "   xchgb   %0,%1   \n"
        :   "+q"(_res), "+m"(ptr->value)
        :
        :   "memory");
    return _res == 0;
}

#define PG_HAVE_ATOMIC_CLEAR_FLAG
static inline void
pg_atomic_clear_flag_impl(volatile pg_atomic_flag *ptr)
{
    /*
     * On a TSO architecture like x86 it's sufficient to use a compiler
     * barrier to achieve release semantics.
     */
    __asm__ __volatile__("" ::: "memory");
    ptr->value = 0;
}
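
/*
 * Illustrative sketch only (under #if 0): the two functions above pair up as
 * a simple test-and-set lock, with the xchg providing acquire semantics and
 * the compiler barrier plus plain store providing release semantics on TSO.
 * The helper names are hypothetical; real code uses the pg_atomic_* wrappers
 * from atomics.h rather than the *_impl functions directly.
 */
#if 0
static inline void
example_flag_lock(volatile pg_atomic_flag *f)
{
    while (!pg_atomic_test_set_flag_impl(f))
        pg_spin_delay_impl();   /* spin until we observe the flag clear */
}

static inline void
example_flag_unlock(volatile pg_atomic_flag *f)
{
    pg_atomic_clear_flag_impl(f);   /* release: compiler barrier + store */
}
#endif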

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U32
static inline bool
pg_atomic_compare_exchange_u32_impl(volatile pg_atomic_uint32 *ptr,
                                    uint32 *expected, uint32 newval)
{
    char        ret;

    /*
     * Perform cmpxchg and use the zero flag which it implicitly sets when
     * equal to measure the success.
     */
    __asm__ __volatile__(
        "   lock                \n"
        "   cmpxchgl    %4,%5   \n"
        "   setz        %2      \n"
        :   "=a" (*expected), "=m"(ptr->value), "=q" (ret)
        :   "a" (*expected), "r" (newval), "m"(ptr->value)
        :   "memory", "cc");
    return (bool) ret;
}
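
/*
 * Illustrative sketch only (under #if 0): the usual retry-loop pattern built
 * on top of the compare-exchange above, here computing an atomic fetch-or.
 * On failure cmpxchg reloads *expected with the current value, so the loop
 * simply recomputes and retries.  The function name is hypothetical; the
 * generic layer in atomics/generic.h provides the real fallbacks.
 */
#if 0
static inline uint32
example_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
{
    uint32      old = ptr->value;   /* initial guess; refreshed on failure */

    while (!pg_atomic_compare_exchange_u32_impl(ptr, &old, old | or_))
        ;
    return old;                 /* value observed just before our update */
}
#endif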

#define PG_HAVE_ATOMIC_FETCH_ADD_U32
static inline uint32
pg_atomic_fetch_add_u32_impl(volatile pg_atomic_uint32 *ptr, int32 add_)
{
    uint32      res;

    __asm__ __volatile__(
        "   lock            \n"
        "   xaddl   %0,%1   \n"
        :   "=q"(res), "=m"(ptr->value)
        :   "0" (add_), "m"(ptr->value)
        :   "memory", "cc");
    return res;
}
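
/*
 * Illustrative sketch only (under #if 0): because xadd returns the previous
 * value, a fetch-and-subtract needs no instruction of its own; adding the
 * negated operand is enough.  The function name is hypothetical.
 */
#if 0
static inline uint32
example_fetch_sub_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
{
    return pg_atomic_fetch_add_u32_impl(ptr, -sub_);
}
#endif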

#ifdef __x86_64__

#define PG_HAVE_ATOMIC_COMPARE_EXCHANGE_U64
static inline bool
pg_atomic_compare_exchange_u64_impl(volatile pg_atomic_uint64 *ptr,
                                    uint64 *expected, uint64 newval)
{
    char        ret;

    AssertPointerAlignment(expected, 8);

    /*
     * Perform cmpxchg and use the zero flag which it implicitly sets when
     * equal to measure the success.
     */
    __asm__ __volatile__(
        "   lock                \n"
        "   cmpxchgq    %4,%5   \n"
        "   setz        %2      \n"
        :   "=a" (*expected), "=m"(ptr->value), "=q" (ret)
        :   "a" (*expected), "r" (newval), "m"(ptr->value)
        :   "memory", "cc");
    return (bool) ret;
}

#define PG_HAVE_ATOMIC_FETCH_ADD_U64
static inline uint64
pg_atomic_fetch_add_u64_impl(volatile pg_atomic_uint64 *ptr, int64 add_)
{
    uint64      res;

    __asm__ __volatile__(
        "   lock            \n"
        "   xaddq   %0,%1   \n"
        :   "=q"(res), "=m"(ptr->value)
        :   "0" (add_), "m"(ptr->value)
        :   "memory", "cc");
    return res;
}

#endif                          /* __x86_64__ */

#endif                          /* defined(__GNUC__) || defined(__INTEL_COMPILER) */

/*
 * 8-byte reads and writes have single-copy atomicity on 32-bit x86 platforms
 * since at least the 586, as well as on all x86-64 CPUs.
 */
#if defined(__i586__) || defined(__i686__) || /* gcc i586+ */  \
    (defined(_M_IX86) && _M_IX86 >= 500) ||   /* msvc i586+ */ \
    defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)    /* gcc, sunpro, msvc */
#define PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
#endif                          /* 8 byte single-copy atomicity */
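
/*
 * Illustrative sketch only (under #if 0): what this macro buys a generic
 * layer.  When 8-byte loads and stores are single-copy atomic, a 64-bit read
 * can be a plain aligned load instead of a locked compare-exchange.  The
 * function below is a hypothetical shape, not the actual definition from
 * atomics/generic.h, and pg_atomic_uint64 is only defined above for x86-64.
 */
#if 0
#if defined(PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY) && defined(PG_HAVE_ATOMIC_U64_SUPPORT)
static inline uint64
example_read_u64(volatile pg_atomic_uint64 *ptr)
{
    return ptr->value;          /* aligned 8-byte load is atomic */
}
#endif
#endif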