Line data Source code
1 : /* Copyright (C) 2007-2024 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : Under Section 7 of GPL version 3, you are granted additional
16 : permissions described in the GCC Runtime Library Exception, version
17 : 3.1, as published by the Free Software Foundation.
18 :
19 : You should have received a copy of the GNU General Public License and
20 : a copy of the GCC Runtime Library Exception along with this program;
21 : see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
22 : <http://www.gnu.org/licenses/>. */
23 :
24 : /* Implemented from the specification included in the Intel C++ Compiler
25 : User Guide and Reference, version 10.0. */
26 :
27 : #ifndef _SMMINTRIN_H_INCLUDED
28 : #define _SMMINTRIN_H_INCLUDED
29 :
30 : /* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
31 : files. */
32 : #include <tmmintrin.h>
33 :
/* When __SSE4_1__ is not defined, save the current target options and
   switch on "sse4.1" for the declarations that follow.
   __DISABLE_SSE4_1__ records that the options were pushed here; the
   matching pop_options near the end of this header is guarded by it.  */
34 : #ifndef __SSE4_1__
35 : #pragma GCC push_options
36 : #pragma GCC target("sse4.1")
37 : #define __DISABLE_SSE4_1__
38 : #endif /* __SSE4_1__ */
39 :
40 : /* Rounding mode macros. */
/* Values 0x00-0x03 name an explicit rounding direction, 0x04 names the
   "current direction" choice, and 0x08 is the no-exception flag (0x00
   being its raise-exception counterpart).  Each composite macro further
   down ORs exactly one direction value with one exception value.  */
41 : #define _MM_FROUND_TO_NEAREST_INT 0x00
42 : #define _MM_FROUND_TO_NEG_INF 0x01
43 : #define _MM_FROUND_TO_POS_INF 0x02
44 : #define _MM_FROUND_TO_ZERO 0x03
45 : #define _MM_FROUND_CUR_DIRECTION 0x04
46 :
47 : #define _MM_FROUND_RAISE_EXC 0x00
48 : #define _MM_FROUND_NO_EXC 0x08
49 :
50 : #define _MM_FROUND_NINT \
51 : (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
52 : #define _MM_FROUND_FLOOR \
53 : (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
54 : #define _MM_FROUND_CEIL \
55 : (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
56 : #define _MM_FROUND_TRUNC \
57 : (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
58 : #define _MM_FROUND_RINT \
59 : (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
60 : #define _MM_FROUND_NEARBYINT \
61 : (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
62 :
63 : /* Test Instruction */
64 : /* Packed integer 128-bit bitwise comparison. Return 1 if
65 : (__V & __M) == 0. */
/* Reinterprets __M and __V as __v2di and hands them, in that order, to
   the ptestz128 builtin; the builtin's int result is returned as-is.  */
66 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
67 : _mm_testz_si128 (__m128i __M, __m128i __V)
68 : {
69 : return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
70 : }
71 :
72 : /* Packed integer 128-bit bitwise comparison. Return 1 if
73 : (__V & ~__M) == 0. */
/* Reinterprets __M and __V as __v2di and hands them, in that order, to
   the ptestc128 builtin; the builtin's int result is returned as-is.  */
74 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75 : _mm_testc_si128 (__m128i __M, __m128i __V)
76 : {
77 : return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
78 : }
79 :
80 : /* Packed integer 128-bit bitwise comparison. Return 1 if
81 : (__V & __M) != 0 && (__V & ~__M) != 0. */
/* Reinterprets __M and __V as __v2di and hands them, in that order, to
   the ptestnzc128 builtin; the builtin's int result is returned as-is.  */
82 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83 : _mm_testnzc_si128 (__m128i __M, __m128i __V)
84 : {
85 : return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
86 : }
87 :
88 : /* Macros for packed integer 128-bit comparison intrinsics. */
/* _mm_test_all_zeros and _mm_test_mix_ones_zeros forward (M, V)
   unchanged to _mm_testz_si128 and _mm_testnzc_si128.
   _mm_test_all_ones calls _mm_testc_si128 with V as the first operand
   and _mm_cmpeq_epi32 ((V), (V)) as the second; V therefore appears
   three times in the expansion, so avoid arguments with side effects.  */
89 : #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
90 :
91 : #define _mm_test_all_ones(V) \
92 : _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
93 :
94 : #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
95 :
96 : /* Packed/scalar double precision floating point rounding. */
97 :
/* _mm_round_pd passes __V (viewed as __v2df) and the rounding selector
   __M to the roundpd builtin.  _mm_round_sd passes __D, __V (both viewed
   as __v2df) and __M to the roundsd builtin.
   With __OPTIMIZE__ defined these are always-inline functions; otherwise
   they are macros that hand (int)(M) straight to the builtin.
   NOTE(review): presumably the builtin requires M to be a compile-time
   constant, which is why the non-optimizing build avoids the function
   form -- confirm against the GCC builtin documentation.  */
98 : #ifdef __OPTIMIZE__
99 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 : _mm_round_pd (__m128d __V, const int __M)
101 : {
102 : return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
103 : }
104 :
105 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 : _mm_round_sd(__m128d __D, __m128d __V, const int __M)
107 : {
108 : return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
109 : (__v2df)__V,
110 : __M);
111 : }
112 : #else
113 : #define _mm_round_pd(V, M) \
114 : ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))
115 :
116 : #define _mm_round_sd(D, V, M) \
117 : ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \
118 : (__v2df)(__m128d)(V), (int)(M)))
119 : #endif
120 :
121 : /* Packed/scalar single precision floating point rounding. */
122 :
/* _mm_round_ps passes __V (viewed as __v4sf) and the rounding selector
   __M to the roundps builtin.  _mm_round_ss passes __D, __V (both viewed
   as __v4sf) and __M to the roundss builtin.
   With __OPTIMIZE__ defined these are always-inline functions; otherwise
   they are macros that hand (int)(M) straight to the builtin.
   NOTE(review): presumably the builtin requires M to be a compile-time
   constant -- confirm against the GCC builtin documentation.  */
123 : #ifdef __OPTIMIZE__
124 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125 : _mm_round_ps (__m128 __V, const int __M)
126 : {
127 : return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
128 : }
129 :
130 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
131 : _mm_round_ss (__m128 __D, __m128 __V, const int __M)
132 : {
133 : return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
134 : (__v4sf)__V,
135 : __M);
136 : }
137 : #else
138 : #define _mm_round_ps(V, M) \
139 : ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))
140 :
141 : #define _mm_round_ss(D, V, M) \
142 : ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \
143 : (__v4sf)(__m128)(V), (int)(M)))
144 : #endif
145 :
146 : /* Macros for ceil/floor intrinsics. */
/* Each macro forwards its operands to the matching _mm_round_*
   intrinsic with the rounding selector fixed: _MM_FROUND_CEIL for the
   ceil family, _MM_FROUND_FLOOR for the floor family.  */
147 : #define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
148 : #define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
149 :
150 : #define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
151 : #define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
152 :
153 : #define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
154 : #define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
155 :
156 : #define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
157 : #define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
158 :
159 : /* SSE4.1 */
160 :
161 : /* Integer blend instructions - select data from 2 sources using
162 : constant/variable mask. */
163 :
/* Constant-mask integer blend: X and Y are viewed as __v8hi and passed,
   with the int mask M, to the pblendw128 builtin.  Function form when
   __OPTIMIZE__ is defined, macro form (M cast to int) otherwise.  */
164 : #ifdef __OPTIMIZE__
165 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
166 : _mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
167 : {
168 : return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
169 : (__v8hi)__Y,
170 : __M);
171 : }
172 : #else
173 : #define _mm_blend_epi16(X, Y, M) \
174 : ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \
175 : (__v8hi)(__m128i)(Y), (int)(M)))
176 : #endif
177 :
/* Variable-mask integer blend: __X, __Y and the vector mask __M are all
   viewed as __v16qi and passed to the pblendvb128 builtin.  */
178 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
179 : _mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
180 : {
181 : return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
182 : (__v16qi)__Y,
183 : (__v16qi)__M);
184 : }
185 :
186 : /* Single precision floating point blend instructions - select data
187 : from 2 sources using constant/variable mask. */
188 :
/* Constant-mask single precision blend: X and Y are viewed as __v4sf
   and passed, with the int mask M, to the blendps builtin.  Function
   form when __OPTIMIZE__ is defined, macro form otherwise.  */
189 : #ifdef __OPTIMIZE__
190 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
191 : _mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
192 : {
193 : return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
194 : (__v4sf)__Y,
195 : __M);
196 : }
197 : #else
198 : #define _mm_blend_ps(X, Y, M) \
199 : ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \
200 : (__v4sf)(__m128)(Y), (int)(M)))
201 : #endif
202 :
/* Variable-mask single precision blend: __X, __Y and the vector mask
   __M are all viewed as __v4sf and passed to the blendvps builtin.  */
203 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
204 : _mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
205 : {
206 : return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
207 : (__v4sf)__Y,
208 : (__v4sf)__M);
209 : }
210 :
211 : /* Double precision floating point blend instructions - select data
212 : from 2 sources using constant/variable mask. */
213 :
/* Constant-mask double precision blend: X and Y are viewed as __v2df
   and passed, with the int mask M, to the blendpd builtin.  Function
   form when __OPTIMIZE__ is defined, macro form otherwise.  */
214 : #ifdef __OPTIMIZE__
215 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
216 : _mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
217 : {
218 : return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
219 : (__v2df)__Y,
220 : __M);
221 : }
222 : #else
223 : #define _mm_blend_pd(X, Y, M) \
224 : ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \
225 : (__v2df)(__m128d)(Y), (int)(M)))
226 : #endif
227 :
/* Variable-mask double precision blend: __X, __Y and the vector mask
   __M are all viewed as __v2df and passed to the blendvpd builtin.  */
228 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
229 : _mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
230 : {
231 : return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
232 : (__v2df)__Y,
233 : (__v2df)__M);
234 : }
235 :
236 : /* Dot product instructions with mask-defined summing and zeroing parts
237 : of result. */
238 :
/* _mm_dp_ps views X and Y as __v4sf and calls the dpps builtin;
   _mm_dp_pd views them as __v2df and calls the dppd builtin.  In both,
   M is the int mask passed as the third builtin argument.  Function
   forms when __OPTIMIZE__ is defined, macro forms otherwise.  */
239 : #ifdef __OPTIMIZE__
240 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241 : _mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
242 : {
243 : return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
244 : (__v4sf)__Y,
245 : __M);
246 : }
247 :
248 : extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
249 : _mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
250 : {
251 : return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
252 : (__v2df)__Y,
253 : __M);
254 : }
255 : #else
256 : #define _mm_dp_ps(X, Y, M) \
257 : ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \
258 : (__v4sf)(__m128)(Y), (int)(M)))
259 :
260 : #define _mm_dp_pd(X, Y, M) \
261 : ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \
262 : (__v2df)(__m128d)(Y), (int)(M)))
263 : #endif
264 :
265 : /* Packed integer 64-bit equality comparison (vector == on __v2di),
266 : zeroing or filling with ones corresponding parts of result. */
267 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
268 : _mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
269 : {
270 : return (__m128i) ((__v2di)__X == (__v2di)__Y);
271 : }
272 :
273 : /* Min/max packed integer instructions. */
274 :
/* Packed minimum: __X and __Y are viewed as __v16qi and passed to the
   pminsb128 builtin.  */
275 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276 : _mm_min_epi8 (__m128i __X, __m128i __Y)
277 : {
278 : return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
279 : }
280 :
/* Packed maximum: __X and __Y are viewed as __v16qi and passed to the
   pmaxsb128 builtin.  */
281 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
282 : _mm_max_epi8 (__m128i __X, __m128i __Y)
283 : {
284 : return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
285 : }
286 :
/* Packed minimum: __X and __Y are viewed as __v8hi and passed to the
   pminuw128 builtin.  */
287 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288 : _mm_min_epu16 (__m128i __X, __m128i __Y)
289 : {
290 : return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
291 : }
292 :
/* Packed maximum: __X and __Y are viewed as __v8hi and passed to the
   pmaxuw128 builtin.  */
293 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
294 : _mm_max_epu16 (__m128i __X, __m128i __Y)
295 : {
296 : return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
297 : }
298 :
/* Packed minimum: __X and __Y are viewed as __v4si and passed to the
   pminsd128 builtin.  */
299 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
300 : _mm_min_epi32 (__m128i __X, __m128i __Y)
301 : {
302 : return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
303 : }
304 :
/* Packed maximum: __X and __Y are viewed as __v4si and passed to the
   pmaxsd128 builtin.  */
305 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
306 : _mm_max_epi32 (__m128i __X, __m128i __Y)
307 : {
308 : return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
309 : }
310 :
/* Packed minimum via the pminud128 builtin.  The operands are cast to
   __v4si, the same view _mm_min_epi32 uses; only the builtin differs.  */
311 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
312 : _mm_min_epu32 (__m128i __X, __m128i __Y)
313 : {
314 : return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
315 : }
316 :
/* Packed maximum via the pmaxud128 builtin.  The operands are cast to
   __v4si, the same view _mm_max_epi32 uses; only the builtin differs.  */
317 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
318 : _mm_max_epu32 (__m128i __X, __m128i __Y)
319 : {
320 : return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
321 : }
322 :
323 : /* Packed integer 32-bit multiplication with truncation of upper
324 : halves of results. */
/* Written as a plain vector multiply on __v4su views of __X and __Y
   instead of a builtin call.
   NOTE(review): the unsigned element type presumably keeps wraparound
   well-defined -- confirm.  */
325 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326 : _mm_mullo_epi32 (__m128i __X, __m128i __Y)
327 : {
328 : return (__m128i) ((__v4su)__X * (__v4su)__Y);
329 : }
330 :
331 : /* Packed integer 32-bit multiplication of 2 pairs of operands
332 : with two 64-bit results. */
/* __X and __Y are viewed as __v4si and passed to the pmuldq128 builtin;
   the result is cast back to __m128i.  */
333 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
334 : _mm_mul_epi32 (__m128i __X, __m128i __Y)
335 : {
336 : return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
337 : }
338 :
339 : /* Insert single precision float into packed single precision array
340 : element selected by index N. The bits [7-6] of N define S
341 : index, the bits [5-4] define D index, and bits [3-0] define
342 : zeroing mask for D. */
343 :
/* D (destination) and S (source) are viewed as __v4sf and passed with
   the int control value N to the insertps128 builtin.  Function form
   when __OPTIMIZE__ is defined, macro form (N cast to int) otherwise.  */
344 : #ifdef __OPTIMIZE__
345 : extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
346 : _mm_insert_ps (__m128 __D, __m128 __S, const int __N)
347 : {
348 : return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
349 : (__v4sf)__S,
350 : __N);
351 : }
352 : #else
353 : #define _mm_insert_ps(D, S, N) \
354 : ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \
355 : (__v4sf)(__m128)(S), (int)(N)))
356 : #endif
357 :
358 : /* Helper macro to create the N value for _mm_insert_ps. */
/* S is shifted to bit 6, D to bit 4, and M is ORed in unshifted.  The
   arguments are not masked, so a value wider than its field overlaps
   the neighbouring fields.  */
359 : #define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
360 :
361 : /* Extract binary representation of single precision float from packed
362 : single precision array element of X selected by index N. */
363 :
/* The element of X (viewed as __v4sf) selected by N is fetched with the
   vec_ext_v4sf builtin and stored into the float member of a local
   union; the value returned is that union's int member, i.e. the same
   storage read back as int.  The macro form does the same inside a GNU
   statement expression.  */
364 : #ifdef __OPTIMIZE__
365 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
366 : _mm_extract_ps (__m128 __X, const int __N)
367 : {
368 : union { int __i; float __f; } __tmp;
369 : __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
370 : return __tmp.__i;
371 : }
372 : #else
373 : #define _mm_extract_ps(X, N) \
374 : (__extension__ \
375 : ({ \
376 : union { int __i; float __f; } __tmp; \
377 : __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), \
378 : (int)(N)); \
379 : __tmp.__i; \
380 : }))
381 : #endif
382 :
383 : /* Assign to D the element of S (cast to __v4sf) selected by index N,
384 : as returned by __builtin_ia32_vec_ext_v4sf. Expands to a braced
385 : statement, not an expression, so it cannot be used as a value. */
386 : #define _MM_EXTRACT_FLOAT(D, S, N) \
387 : { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
388 :
389 : /* Extract specified single precision float element into the lower
390 : part of __m128. */
/* Implemented as _mm_insert_ps into _mm_setzero_ps (), with the control
   value built by _MM_MK_INSERTPS_NDX: source index N, destination
   index 0, mask 0x0e.  */
391 : #define _MM_PICK_OUT_PS(X, N) \
392 : _mm_insert_ps (_mm_setzero_ps (), (X), \
393 : _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
394 :
395 : /* Insert integer, S, into packed integer array element of D
396 : selected by index N. */
397 :
/* _mm_insert_epi8, _mm_insert_epi32 and _mm_insert_epi64 view D as
   __v16qi, __v4si and __v2di and call the vec_set_v16qi, vec_set_v4si
   and vec_set_v2di builtins respectively, with S as the value and N as
   the element index.  S is int for the 8- and 32-bit forms and long long
   for the 64-bit form, which is only provided when __x86_64__ is
   defined.  Function forms when __OPTIMIZE__ is defined, macro forms
   otherwise.  */
398 : #ifdef __OPTIMIZE__
399 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400 : _mm_insert_epi8 (__m128i __D, int __S, const int __N)
401 : {
402 : return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
403 : __S, __N);
404 : }
405 :
406 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
407 : _mm_insert_epi32 (__m128i __D, int __S, const int __N)
408 : {
409 : return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
410 : __S, __N);
411 : }
412 :
413 : #ifdef __x86_64__
414 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
415 : _mm_insert_epi64 (__m128i __D, long long __S, const int __N)
416 : {
417 : return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
418 : __S, __N);
419 : }
420 : #endif
421 : #else
422 : #define _mm_insert_epi8(D, S, N) \
423 : ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \
424 : (int)(S), (int)(N)))
425 :
426 : #define _mm_insert_epi32(D, S, N) \
427 : ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \
428 : (int)(S), (int)(N)))
429 :
430 : #ifdef __x86_64__
431 : #define _mm_insert_epi64(D, S, N) \
432 : ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \
433 : (long long)(S), (int)(N)))
434 : #endif
435 : #endif
436 :
437 : /* Extract integer from packed integer array element of X selected by
438 : index N. */
439 :
/* _mm_extract_epi8, _mm_extract_epi32 and _mm_extract_epi64 view X as
   __v16qi, __v4si and __v2di and call the vec_ext_v16qi, vec_ext_v4si
   and vec_ext_v2di builtins with element index N.  The 8-bit form casts
   the builtin result to unsigned char before it becomes the int return
   value, so the result is never negative.  The 64-bit form returns
   long long and is only provided when __x86_64__ is defined.  Function
   forms when __OPTIMIZE__ is defined, macro forms otherwise.  */
440 : #ifdef __OPTIMIZE__
441 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
442 : _mm_extract_epi8 (__m128i __X, const int __N)
443 : {
444 : return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
445 : }
446 :
447 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
448 : _mm_extract_epi32 (__m128i __X, const int __N)
449 : {
450 : return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
451 : }
452 :
453 : #ifdef __x86_64__
454 : extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
455 : _mm_extract_epi64 (__m128i __X, const int __N)
456 : {
457 : return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
458 : }
459 : #endif
460 : #else
461 : #define _mm_extract_epi8(X, N) \
462 : ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
463 : #define _mm_extract_epi32(X, N) \
464 : ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))
465 :
466 : #ifdef __x86_64__
467 : #define _mm_extract_epi64(X, N) \
468 : ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
469 : #endif
470 : #endif
471 :
472 : /* Return horizontal packed word minimum and its index in bits [15:0]
473 : and bits [18:16] respectively. */
/* __X is viewed as __v8hi and passed to the phminposuw128 builtin; the
   result is cast back to __m128i.  */
474 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475 : _mm_minpos_epu16 (__m128i __X)
476 : {
477 : return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
478 : }
479 :
480 : /* Packed integer sign-extension. */
481 :
/* Sign-extending conversion: __X is viewed as __v16qi and passed to the
   pmovsxbd128 builtin.  */
482 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483 : _mm_cvtepi8_epi32 (__m128i __X)
484 : {
485 : return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
486 : }
487 :
/* Sign-extending conversion: __X is viewed as __v8hi and passed to the
   pmovsxwd128 builtin.  */
488 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489 : _mm_cvtepi16_epi32 (__m128i __X)
490 : {
491 : return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
492 : }
493 :
/* Sign-extending conversion: __X is viewed as __v16qi and passed to the
   pmovsxbq128 builtin.  */
494 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495 : _mm_cvtepi8_epi64 (__m128i __X)
496 : {
497 : return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
498 : }
499 :
/* Sign-extending conversion: __X is viewed as __v4si and passed to the
   pmovsxdq128 builtin.  */
500 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501 : _mm_cvtepi32_epi64 (__m128i __X)
502 : {
503 : return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
504 : }
505 :
/* Sign-extending conversion: __X is viewed as __v8hi and passed to the
   pmovsxwq128 builtin.  */
506 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507 : _mm_cvtepi16_epi64 (__m128i __X)
508 : {
509 : return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
510 : }
511 :
/* Sign-extending conversion: __X is viewed as __v16qi and passed to the
   pmovsxbw128 builtin.  */
512 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
513 : _mm_cvtepi8_epi16 (__m128i __X)
514 : {
515 : return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
516 : }
517 :
518 : /* Packed integer zero-extension. */
519 :
/* Zero-extending conversion: __X is viewed as __v16qi and passed to the
   pmovzxbd128 builtin.  */
520 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
521 : _mm_cvtepu8_epi32 (__m128i __X)
522 : {
523 : return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
524 : }
525 :
/* Zero-extending conversion: __X is viewed as __v8hi and passed to the
   pmovzxwd128 builtin.  */
526 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527 : _mm_cvtepu16_epi32 (__m128i __X)
528 : {
529 : return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
530 : }
531 :
/* Zero-extending conversion: __X is viewed as __v16qi and passed to the
   pmovzxbq128 builtin.  */
532 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
533 : _mm_cvtepu8_epi64 (__m128i __X)
534 : {
535 : return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
536 : }
537 :
/* Zero-extending conversion: __X is viewed as __v4si and passed to the
   pmovzxdq128 builtin.  */
538 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
539 : _mm_cvtepu32_epi64 (__m128i __X)
540 : {
541 : return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
542 : }
543 :
/* Zero-extending conversion: __X is viewed as __v8hi and passed to the
   pmovzxwq128 builtin.  */
544 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
545 : _mm_cvtepu16_epi64 (__m128i __X)
546 : {
547 : return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
548 : }
549 :
/* Zero-extending conversion: __X is viewed as __v16qi and passed to the
   pmovzxbw128 builtin.  */
550 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551 : _mm_cvtepu8_epi16 (__m128i __X)
552 : {
553 : return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
554 : }
555 :
556 : /* Pack 8 double words from 2 operands into 8 words of result with
557 : unsigned saturation. */
/* __X and __Y are viewed as __v4si and passed, in that order, to the
   packusdw128 builtin; the result is cast back to __m128i.  */
558 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
559 : _mm_packus_epi32 (__m128i __X, __m128i __Y)
560 : {
561 : return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
562 : }
563 :
564 : /* Sum absolute 8-bit integer difference of adjacent groups of 4
565 : byte integers in the first 2 operands. Starting offsets within
566 : operands are determined by the 3rd mask operand. */
567 :
/* X and Y are viewed as __v16qi and passed with the int mask M to the
   mpsadbw128 builtin.  Function form when __OPTIMIZE__ is defined,
   macro form (M cast to int) otherwise.  */
568 : #ifdef __OPTIMIZE__
569 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
570 : _mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
571 : {
572 : return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
573 : (__v16qi)__Y, __M);
574 : }
575 : #else
576 : #define _mm_mpsadbw_epu8(X, Y, M) \
577 : ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \
578 : (__v16qi)(__m128i)(Y), (int)(M)))
579 : #endif
580 :
581 : /* Load double quadword using non-temporal aligned hint. */
/* The pointer __X is cast to __v2di * and passed to the movntdqa
   builtin; the loaded value is returned as __m128i.  Note that the
   parameter is a non-const pointer even though this is a load.  */
582 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583 : _mm_stream_load_si128 (__m128i *__X)
584 : {
585 : return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
586 : }
587 :
/* When __SSE4_2__ is not defined, push the target options a second time
   and switch on "sse4.2" for the string-compare and 64-bit greater-than
   intrinsics that follow.  __DISABLE_SSE4_2__ records that this push
   happened so that it can be undone by the guarded pop_options below.  */
588 : #ifndef __SSE4_2__
589 : #pragma GCC push_options
590 : #pragma GCC target("sse4.2")
591 : #define __DISABLE_SSE4_2__
592 : #endif /* __SSE4_2__ */
593 :
594 : /* These macros specify the source data format. */
/* The groups below use non-overlapping bit fields of a single value:
   source format in bits 1:0 (0x00-0x03), comparison operation in bits
   3:2 (0x00-0x0c), polarity in bits 5:4 (0x00-0x30), and output
   selection in bit 6 (0x40).  Bit 6 has two sets of names, one for
   _mm_cmpXstri and one for _mm_cmpXstrm.  */
595 : #define _SIDD_UBYTE_OPS 0x00
596 : #define _SIDD_UWORD_OPS 0x01
597 : #define _SIDD_SBYTE_OPS 0x02
598 : #define _SIDD_SWORD_OPS 0x03
599 :
600 : /* These macros specify the comparison operation. */
601 : #define _SIDD_CMP_EQUAL_ANY 0x00
602 : #define _SIDD_CMP_RANGES 0x04
603 : #define _SIDD_CMP_EQUAL_EACH 0x08
604 : #define _SIDD_CMP_EQUAL_ORDERED 0x0c
605 :
606 : /* These macros specify the polarity. */
607 : #define _SIDD_POSITIVE_POLARITY 0x00
608 : #define _SIDD_NEGATIVE_POLARITY 0x10
609 : #define _SIDD_MASKED_POSITIVE_POLARITY 0x20
610 : #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
611 :
612 : /* These macros specify the output selection in _mm_cmpXstri (). */
613 : #define _SIDD_LEAST_SIGNIFICANT 0x00
614 : #define _SIDD_MOST_SIGNIFICANT 0x40
615 :
616 : /* These macros specify the output selection in _mm_cmpXstrm (). */
617 : #define _SIDD_BIT_MASK 0x00
618 : #define _SIDD_UNIT_MASK 0x40
619 :
620 : /* Intrinsics for text/string processing. */
621 :
/* Four string-compare wrappers.  The *strm forms return __m128i and the
   *stri forms return int.  The cmpistr* forms take (X, Y, M); the
   cmpestr* forms additionally take int operands LX and LY, placed right
   after X and Y respectively.  X and Y are always viewed as __v16qi and
   M is the int mode value handed to the pcmp{i,e}str{m,i}128 builtin.
   Function forms when __OPTIMIZE__ is defined, macro forms otherwise.  */
622 : #ifdef __OPTIMIZE__
623 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
624 : _mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
625 : {
626 : return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
627 : (__v16qi)__Y,
628 : __M);
629 : }
630 :
631 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
632 : _mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
633 : {
634 : return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
635 : (__v16qi)__Y,
636 : __M);
637 : }
638 :
639 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
640 : _mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
641 : {
642 : return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
643 : (__v16qi)__Y, __LY,
644 : __M);
645 : }
646 :
647 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
648 : _mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
649 : {
650 : return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
651 : (__v16qi)__Y, __LY,
652 : __M);
653 : }
654 : #else
655 : #define _mm_cmpistrm(X, Y, M) \
656 : ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \
657 : (__v16qi)(__m128i)(Y), (int)(M)))
658 : #define _mm_cmpistri(X, Y, M) \
659 : ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \
660 : (__v16qi)(__m128i)(Y), (int)(M)))
661 :
662 : #define _mm_cmpestrm(X, LX, Y, LY, M) \
663 : ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \
664 : (int)(LX), (__v16qi)(__m128i)(Y), \
665 : (int)(LY), (int)(M)))
666 : #define _mm_cmpestri(X, LX, Y, LY, M) \
667 : ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \
668 : (__v16qi)(__m128i)(Y), (int)(LY), \
669 : (int)(M)))
670 : #endif
671 :
672 : /* Intrinsics for text/string processing and reading values of
673 : EFlags. */
674 :
/* Ten int-returning wrappers: suffixes a, c, o, s and z, each in a
   cmpistr* form taking (X, Y, M) and a cmpestr* form taking
   (X, LX, Y, LY, M).  Each one views X and Y as __v16qi and calls the
   builtin whose name carries the same suffix (pcmpistria128,
   pcmpistric128, ..., pcmpestriz128), returning its result unchanged.
   Function forms when __OPTIMIZE__ is defined, macro forms otherwise.  */
675 : #ifdef __OPTIMIZE__
676 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677 : _mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
678 : {
679 : return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
680 : (__v16qi)__Y,
681 : __M);
682 : }
683 :
684 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
685 : _mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
686 : {
687 : return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
688 : (__v16qi)__Y,
689 : __M);
690 : }
691 :
692 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
693 : _mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
694 : {
695 : return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
696 : (__v16qi)__Y,
697 : __M);
698 : }
699 :
700 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 : _mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
702 : {
703 : return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
704 : (__v16qi)__Y,
705 : __M);
706 : }
707 :
708 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 : _mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
710 : {
711 : return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
712 : (__v16qi)__Y,
713 : __M);
714 : }
715 :
716 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
717 : _mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
718 : {
719 : return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
720 : (__v16qi)__Y, __LY,
721 : __M);
722 : }
723 :
724 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
725 : _mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
726 : {
727 : return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
728 : (__v16qi)__Y, __LY,
729 : __M);
730 : }
731 :
732 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
733 : _mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
734 : {
735 : return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
736 : (__v16qi)__Y, __LY,
737 : __M);
738 : }
739 :
740 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741 : _mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
742 : {
743 : return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
744 : (__v16qi)__Y, __LY,
745 : __M);
746 : }
747 :
748 : extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
749 : _mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
750 : {
751 : return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
752 : (__v16qi)__Y, __LY,
753 : __M);
754 : }
755 : #else
756 : #define _mm_cmpistra(X, Y, M) \
757 : ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \
758 : (__v16qi)(__m128i)(Y), (int)(M)))
759 : #define _mm_cmpistrc(X, Y, M) \
760 : ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \
761 : (__v16qi)(__m128i)(Y), (int)(M)))
762 : #define _mm_cmpistro(X, Y, M) \
763 : ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \
764 : (__v16qi)(__m128i)(Y), (int)(M)))
765 : #define _mm_cmpistrs(X, Y, M) \
766 : ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \
767 : (__v16qi)(__m128i)(Y), (int)(M)))
768 : #define _mm_cmpistrz(X, Y, M) \
769 : ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \
770 : (__v16qi)(__m128i)(Y), (int)(M)))
771 :
772 : #define _mm_cmpestra(X, LX, Y, LY, M) \
773 : ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
774 : (__v16qi)(__m128i)(Y), (int)(LY), \
775 : (int)(M)))
776 : #define _mm_cmpestrc(X, LX, Y, LY, M) \
777 : ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
778 : (__v16qi)(__m128i)(Y), (int)(LY), \
779 : (int)(M)))
780 : #define _mm_cmpestro(X, LX, Y, LY, M) \
781 : ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
782 : (__v16qi)(__m128i)(Y), (int)(LY), \
783 : (int)(M)))
784 : #define _mm_cmpestrs(X, LX, Y, LY, M) \
785 : ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
786 : (__v16qi)(__m128i)(Y), (int)(LY), \
787 : (int)(M)))
788 : #define _mm_cmpestrz(X, LX, Y, LY, M) \
789 : ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
790 : (__v16qi)(__m128i)(Y), (int)(LY), \
791 : (int)(M)))
792 : #endif
793 :
794 : /* Packed integer 64-bit greater-than comparison (vector > on __v2di),
795 : zeroing or filling with ones corresponding parts of result. */
796 : extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
797 : _mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
798 : {
799 : return (__m128i) ((__v2di)__X > (__v2di)__Y);
800 : }
801 :
/* Undo the target-option pushes made earlier in this header, innermost
   first: the "sse4.2" push, then the "sse4.1" push.  Each pop happens
   only if the corresponding __DISABLE_* marker was defined by this
   header, and the marker is undefined again before popping.  */
802 : #ifdef __DISABLE_SSE4_2__
803 : #undef __DISABLE_SSE4_2__
804 : #pragma GCC pop_options
805 : #endif /* __DISABLE_SSE4_2__ */
806 :
807 : #ifdef __DISABLE_SSE4_1__
808 : #undef __DISABLE_SSE4_1__
809 : #pragma GCC pop_options
810 : #endif /* __DISABLE_SSE4_1__ */
811 :
812 : #include <popcntintrin.h>
813 :
/* When __CRC32__ is not defined, save the current target options and
   switch on "crc32" for the CRC intrinsics that follow.
   __DISABLE_CRC32__ records that the options were pushed here.  */
814 : #ifndef __CRC32__
815 : #pragma GCC push_options
816 : #pragma GCC target("crc32")
817 : #define __DISABLE_CRC32__
818 : #endif /* __CRC32__ */
819 :
820 : /* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */
/* Passes the running value __C and the unsigned char input __V to the
   crc32qi builtin and returns its result as unsigned int.  */
821 : extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
822 : _mm_crc32_u8 (unsigned int __C, unsigned char __V)
823 : {
824 88242106 : return __builtin_ia32_crc32qi (__C, __V);
825 : }
826 :
/* Passes the running value __C and the unsigned short input __V to the
   crc32hi builtin and returns its result as unsigned int.  */
827 : extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828 : _mm_crc32_u16 (unsigned int __C, unsigned short __V)
829 : {
830 : return __builtin_ia32_crc32hi (__C, __V);
831 : }
832 :
/* Passes the running value __C and the unsigned int input __V to the
   crc32si builtin and returns its result as unsigned int.  */
833 : extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 : _mm_crc32_u32 (unsigned int __C, unsigned int __V)
835 : {
836 50926579 : return __builtin_ia32_crc32si (__C, __V);
837 : }
838 :
/* Only provided when __x86_64__ is defined.  Unlike the narrower
   variants, both the running value __C and the return value are
   unsigned long long; __C and __V go to the crc32di builtin.  */
839 : #ifdef __x86_64__
840 : extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
841 : _mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
842 : {
843 3978223312 : return __builtin_ia32_crc32di (__C, __V);
844 : }
845 : #endif
846 :
/* Restore the target options saved before the CRC intrinsics, but only
   if this header pushed them (__DISABLE_CRC32__ defined); the marker is
   undefined again before popping.  */
847 : #ifdef __DISABLE_CRC32__
848 : #undef __DISABLE_CRC32__
849 : #pragma GCC pop_options
850 : #endif /* __DISABLE_CRC32__ */
851 :
852 : #endif /* _SMMINTRIN_H_INCLUDED */
|