/*-------------------------------------------------------------------------
 *
 * simd.h
 *    Support for platform-specific vector operations.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/port/simd.h
 *
 * NOTES
 * - VectorN in this file refers to a register whose element operands are
 * N bits wide.  The vector width is platform-specific, so users that care
 * about it will need to inspect "sizeof(VectorN)".
 *
 *-------------------------------------------------------------------------
 */
#ifndef SIMD_H
#define SIMD_H

#if (defined(__x86_64__) || defined(_M_AMD64))
/*
 * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
 * that compilers targeting this architecture understand SSE2 intrinsics.
 *
 * We use emmintrin.h rather than the comprehensive header immintrin.h in
 * order to exclude extensions beyond SSE2. This is because MSVC, at least,
 * will allow the use of intrinsics that haven't been enabled at compile
 * time.
 */
#include <emmintrin.h>
#define USE_SSE2
typedef __m128i Vector8;
typedef __m128i Vector32;

#elif defined(__aarch64__) && defined(__ARM_NEON)
/*
 * We use the Neon instructions if the compiler provides access to them (as
 * indicated by __ARM_NEON) and we are on aarch64.  While Neon support is
 * technically optional for aarch64, it appears that all available 64-bit
 * hardware does have it.  Neon exists in some 32-bit hardware too, but we
 * could not realistically use it there without a run-time check, which seems
 * not worth the trouble for now.
 */
#include <arm_neon.h>
#define USE_NEON
typedef uint8x16_t Vector8;
typedef uint32x4_t Vector32;

#else
/*
 * If no SIMD instructions are available, we can in some cases emulate vector
 * operations using bitwise operations on unsigned integers.  Note that many
 * of the functions in this file presently do not have non-SIMD
 * implementations.  In particular, none of the functions involving Vector32
 * are implemented without SIMD since it's likely not worthwhile to represent
 * two 32-bit integers using a uint64.
 */
#define USE_NO_SIMD
typedef uint64 Vector8;
#endif

/* load/store operations */
static inline void vector8_load(Vector8 *v, const uint8 *s);
#ifndef USE_NO_SIMD
static inline void vector32_load(Vector32 *v, const uint32 *s);
#endif

/* assignment operations */
static inline Vector8 vector8_broadcast(const uint8 c);
#ifndef USE_NO_SIMD
static inline Vector32 vector32_broadcast(const uint32 c);
#endif

/* element-wise comparisons to a scalar */
static inline bool vector8_has(const Vector8 v, const uint8 c);
static inline bool vector8_has_zero(const Vector8 v);
static inline bool vector8_has_le(const Vector8 v, const uint8 c);
static inline bool vector8_is_highbit_set(const Vector8 v);
#ifndef USE_NO_SIMD
static inline bool vector32_is_highbit_set(const Vector32 v);
static inline uint32 vector8_highbit_mask(const Vector8 v);
#endif

/* arithmetic operations */
static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
#ifndef USE_NO_SIMD
static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
#endif

/*
 * comparisons between vectors
 *
 * Note: These return a vector rather than boolean, which is why we don't
 * have non-SIMD implementations.
 */
#ifndef USE_NO_SIMD
static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
#endif

/*
 * Load a chunk of memory into the given vector.
 */
static inline void
vector8_load(Vector8 *v, const uint8 *s)
{
#if defined(USE_SSE2)
    *v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
    *v = vld1q_u8(s);
#else
    memcpy(v, s, sizeof(Vector8));
#endif
}
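
/*
 * Editor's note: a minimal usage sketch, not part of the original file.
 * Both load intrinsics above are unaligned loads (_mm_loadu_si128,
 * vld1q_u8), so "s" needs no particular alignment, only sizeof(Vector8)
 * readable bytes.  copy_chunk() is a hypothetical helper for illustration.
 */
static inline void
copy_chunk(uint8 *dst, const uint8 *src)
{
    Vector8     chunk;

    vector8_load(&chunk, src);  /* src may be at any byte offset */
    memcpy(dst, &chunk, sizeof(Vector8));
}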

#ifndef USE_NO_SIMD
static inline void
vector32_load(Vector32 *v, const uint32 *s)
{
#ifdef USE_SSE2
    *v = _mm_loadu_si128((const __m128i *) s);
#elif defined(USE_NEON)
    *v = vld1q_u32(s);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Store a vector to the given memory address.
 */
#ifndef USE_NO_SIMD
static inline void
vector8_store(uint8 *s, Vector8 v)
{
#ifdef USE_SSE2
    _mm_storeu_si128((Vector8 *) s, v);
#elif defined(USE_NEON)
    vst1q_u8(s, v);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Create a vector with all elements set to the same value.
 */
static inline Vector8
vector8_broadcast(const uint8 c)
{
#if defined(USE_SSE2)
    return _mm_set1_epi8(c);
#elif defined(USE_NEON)
    return vdupq_n_u8(c);
#else
    /*
     * ~0 / 0xFF is 0x0101010101010101, so multiplying by c replicates it
     * into every byte lane.
     */
    return ~UINT64CONST(0) / 0xFF * c;
#endif
}
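
/*
 * Editor's sketch of the non-SIMD replication trick above, shown standalone:
 * multiplying 0x0101010101010101 by c writes c into each of the eight byte
 * lanes.  vector8_broadcast_demo() is an illustrative name only.
 */
static inline void
vector8_broadcast_demo(void)
{
    uint64      ones = ~UINT64CONST(0) / 0xFF;  /* 0x0101010101010101 */

    Assert(ones * 0xAB == UINT64CONST(0xABABABABABABABAB));
}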

#ifndef USE_NO_SIMD
static inline Vector32
vector32_broadcast(const uint32 c)
{
#ifdef USE_SSE2
    return _mm_set1_epi32(c);
#elif defined(USE_NEON)
    return vdupq_n_u32(c);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return true if any elements in the vector are equal to the given scalar.
 */
static inline bool
vector8_has(const Vector8 v, const uint8 c)
{
    bool        result;

    /* pre-compute the result for assert checking */
#ifdef USE_ASSERT_CHECKING
    bool        assert_result = false;

    for (Size i = 0; i < sizeof(Vector8); i++)
    {
        if (((const uint8 *) &v)[i] == c)
        {
            assert_result = true;
            break;
        }
    }
#endif                          /* USE_ASSERT_CHECKING */

#if defined(USE_NO_SIMD)
    /* any bytes in v equal to c will evaluate to zero via XOR */
    result = vector8_has_zero(v ^ vector8_broadcast(c));
#else
    result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
#endif

    Assert(assert_result == result);
    return result;
}
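
/*
 * Editor's sketch: a typical caller pattern, similar in spirit to the
 * helpers in src/include/port/pg_lfind.h.  contains_byte() is a
 * hypothetical name; for brevity it ignores any trailing bytes beyond the
 * last full vector, which a real caller would handle separately.
 */
static inline bool
contains_byte(const uint8 *base, Size len, uint8 c)
{
    for (Size i = 0; i + sizeof(Vector8) <= len; i += sizeof(Vector8))
    {
        Vector8     chunk;

        vector8_load(&chunk, &base[i]);
        if (vector8_has(chunk, c))
            return true;
    }
    return false;
}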

/*
 * Convenience function equivalent to vector8_has(v, 0).
 */
static inline bool
vector8_has_zero(const Vector8 v)
{
#if defined(USE_NO_SIMD)
    /*
     * We cannot call vector8_has() here, because that would lead to a
     * circular definition.
     */
    return vector8_has_le(v, 0);
#else
    return vector8_has(v, 0);
#endif
}

/*
 * Return true if any elements in the vector are less than or equal to the
 * given scalar.
 */
static inline bool
vector8_has_le(const Vector8 v, const uint8 c)
{
    bool        result = false;
#ifdef USE_SSE2
    Vector8     umin;
    Vector8     cmpe;
#endif

    /* pre-compute the result for assert checking */
#ifdef USE_ASSERT_CHECKING
    bool        assert_result = false;

    for (Size i = 0; i < sizeof(Vector8); i++)
    {
        if (((const uint8 *) &v)[i] <= c)
        {
            assert_result = true;
            break;
        }
    }
#endif                          /* USE_ASSERT_CHECKING */

#if defined(USE_NO_SIMD)

    /*
     * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
     * but it only works if c+1 <= 128 and if the highest bit in v is not set.
     * Adapted from
     * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
     */
    if ((int64) v >= 0 && c < 0x80)
        result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
    else
    {
        /* one byte at a time */
        for (Size i = 0; i < sizeof(Vector8); i++)
        {
            if (((const uint8 *) &v)[i] <= c)
            {
                result = true;
                break;
            }
        }
    }
#elif defined(USE_SSE2)
    /* lanes where v <= c are exactly those where min(v, c) == v */
    umin = vector8_min(v, vector8_broadcast(c));
    cmpe = vector8_eq(umin, v);
    result = vector8_is_highbit_set(cmpe);
#elif defined(USE_NEON)
    result = vminvq_u8(v) <= c;
#endif

    Assert(assert_result == result);
    return result;
}
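
/*
 * Editor's sketch: the "HasLessInWord" trick from the comment above, shown
 * standalone on a uint64 treated as eight byte lanes.  Subtracting
 * 0x0101...01 * (c + 1) borrows into a lane's 0x80 bit exactly when that
 * lane's value is <= c, provided no input lane has its high bit set.
 * vector8_has_le_demo() is an illustrative name only.
 */
static inline void
vector8_has_le_demo(void)
{
    uint64      ones = ~UINT64CONST(0) / 0xFF;  /* 0x0101...01 */
    uint64      highbits = ones * 0x80; /* 0x8080...80 */
    uint64      v1 = UINT64CONST(0x0511223344556677);   /* all bytes > 0x04 */
    uint64      v2 = UINT64CONST(0x0311223344556677);   /* top byte <= 0x04 */
    uint8       c = 0x04;

    Assert(((v1 - ones * (c + 1)) & ~v1 & highbits) == 0);
    Assert(((v2 - ones * (c + 1)) & ~v2 & highbits) != 0);
}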

/*
 * Return true if any elements in the vector are greater than or equal to the
 * given scalar.
 */
#ifndef USE_NO_SIMD
static inline bool
vector8_has_ge(const Vector8 v, const uint8 c)
{
#ifdef USE_SSE2
    /* lanes where v >= c are exactly those where max(v, c) == v */
    Vector8     umax = _mm_max_epu8(v, vector8_broadcast(c));
    Vector8     cmpe = vector8_eq(umax, v);

    return vector8_is_highbit_set(cmpe);
#elif defined(USE_NEON)
    return vmaxvq_u8(v) >= c;
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return true if the high bit of any element is set.
 */
static inline bool
vector8_is_highbit_set(const Vector8 v)
{
#ifdef USE_SSE2
    return _mm_movemask_epi8(v) != 0;
#elif defined(USE_NEON)
    return vmaxvq_u8(v) > 0x7F;
#else
    return v & vector8_broadcast(0x80);
#endif
}

/*
 * Exactly like vector8_is_highbit_set except for the input type, so it
 * looks at each byte separately.
 *
 * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
 * integer elements, but Arm does not, hence the need for a separate
 * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
 * check each 32-bit element, but that would require an additional mask
 * operation on x86.
 */
#ifndef USE_NO_SIMD
static inline bool
vector32_is_highbit_set(const Vector32 v)
{
#if defined(USE_NEON)
    return vector8_is_highbit_set((Vector8) v);
#else
    return vector8_is_highbit_set(v);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return a bitmask formed from the high bit of each element.
 */
#ifndef USE_NO_SIMD
static inline uint32
vector8_highbit_mask(const Vector8 v)
{
#ifdef USE_SSE2
    return (uint32) _mm_movemask_epi8(v);
#elif defined(USE_NEON)
    /*
     * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
     * returns a uint64, making it inconvenient to combine mask values from
     * multiple vectors.
     */
    static const uint8 mask[16] = {
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
    };

    uint8x16_t  masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
    uint8x16_t  maskedhi = vextq_u8(masked, masked, 8);

    return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
#endif
}
#endif                          /* ! USE_NO_SIMD */
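
/*
 * Editor's sketch: combining vector8_eq() with vector8_highbit_mask() gives
 * the classic "index of first matching byte" idiom.  first_match() is a
 * hypothetical helper; pg_rightmost_one_pos32() (from port/pg_bitutils.h)
 * returns the position of the least significant set bit.
 */
#ifndef USE_NO_SIMD
static inline int
first_match(const uint8 *s, uint8 c)
{
    Vector8     chunk;
    uint32      mask;

    vector8_load(&chunk, s);
    mask = vector8_highbit_mask(vector8_eq(chunk, vector8_broadcast(c)));

    /* report sizeof(Vector8) if no byte of s matches c */
    if (mask == 0)
        return (int) sizeof(Vector8);
    return pg_rightmost_one_pos32(mask);
}
#endif                          /* ! USE_NO_SIMD */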

/*
 * Return the bitwise OR of the inputs.
 */
static inline Vector8
vector8_or(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
    return vorrq_u8(v1, v2);
#else
    return v1 | v2;
#endif
}

#ifndef USE_NO_SIMD
static inline Vector32
vector32_or(const Vector32 v1, const Vector32 v2)
{
#ifdef USE_SSE2
    return _mm_or_si128(v1, v2);
#elif defined(USE_NEON)
    return vorrq_u32(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return the bitwise AND of the inputs.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_and(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_and_si128(v1, v2);
#elif defined(USE_NEON)
    return vandq_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return the result of adding the respective elements of the input vectors.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_add(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_add_epi8(v1, v2);
#elif defined(USE_NEON)
    return vaddq_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return the result of subtracting the respective elements of the input
 * vectors using signed saturation (i.e., results below -128 are clamped to
 * -128, and results above 127 are clamped to 127).  For more information on
 * saturation arithmetic, see
 * https://en.wikipedia.org/wiki/Saturation_arithmetic
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_issub(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_subs_epi8(v1, v2);
#elif defined(USE_NEON)
    return (Vector8) vqsubq_s8((int8x16_t) v1, (int8x16_t) v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */
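
/*
 * Editor's sketch of the saturation semantics: subtracting 0x90 (-112 as a
 * signed byte) from 0x70 (+112) would give +224, which exceeds the int8
 * maximum, so every lane saturates to 0x7F (+127).  vector8_issub_demo() is
 * an illustrative name only.
 */
#ifndef USE_NO_SIMD
static inline void
vector8_issub_demo(void)
{
    Vector8     res = vector8_issub(vector8_broadcast(0x70),
                                    vector8_broadcast(0x90));

    Assert(vector8_has(res, 0x7F));
    Assert(!vector8_has_ge(res, 0x80));
}
#endif                          /* ! USE_NO_SIMD */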

/*
 * Return a vector with all bits set in each lane where the corresponding
 * lanes in the inputs are equal.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_eq(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_cmpeq_epi8(v1, v2);
#elif defined(USE_NEON)
    return vceqq_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

#ifndef USE_NO_SIMD
static inline Vector32
vector32_eq(const Vector32 v1, const Vector32 v2)
{
#ifdef USE_SSE2
    return _mm_cmpeq_epi32(v1, v2);
#elif defined(USE_NEON)
    return vceqq_u32(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Return a vector with all bits set for each lane of v1 that is greater than
 * the corresponding lane of v2.  NB: The comparison treats the elements as
 * signed.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_gt(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_cmpgt_epi8(v1, v2);
#elif defined(USE_NEON)
    return vcgtq_s8((int8x16_t) v1, (int8x16_t) v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Given two vectors, return a vector with the lane-wise minimum of the two.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_min(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_min_epu8(v1, v2);
#elif defined(USE_NEON)
    return vminq_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Interleave elements of the low halves (e.g., for SSE2, bits 0-63) of the
 * given vectors.  Bytes 0, 2, 4, etc. of the result use v1, and bytes 1, 3,
 * 5, etc. use v2.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_interleave_low(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_unpacklo_epi8(v1, v2);
#elif defined(USE_NEON)
    return vzip1q_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Interleave elements of the high halves (e.g., for SSE2, bits 64-127) of
 * the given vectors.  Bytes 0, 2, 4, etc. of the result use v1, and bytes 1,
 * 3, 5, etc. use v2.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_interleave_high(const Vector8 v1, const Vector8 v2)
{
#ifdef USE_SSE2
    return _mm_unpackhi_epi8(v1, v2);
#elif defined(USE_NEON)
    return vzip2q_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */
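
/*
 * Editor's sketch of the interleave results: with byte lanes
 * v1 = {a0, a1, ..., a15} and v2 = {b0, b1, ..., b15} (memory order),
 * vector8_interleave_low() returns {a0, b0, a1, b1, ..., a7, b7} and
 * vector8_interleave_high() returns {a8, b8, a9, b9, ..., a15, b15}.
 * vector8_interleave_demo() is an illustrative name only.
 */
#ifndef USE_NO_SIMD
static inline void
vector8_interleave_demo(void)
{
    const uint8 a[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    const uint8 b[16] = {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
    const uint8 lo[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
    uint8       out[16];
    Vector8     v1,
                v2;

    vector8_load(&v1, a);
    vector8_load(&v2, b);
    vector8_store(out, vector8_interleave_low(v1, v2));
    Assert(memcmp(out, lo, sizeof(out)) == 0);
}
#endif                          /* ! USE_NO_SIMD */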

/*
 * Pack 16-bit elements in the given vectors into a single vector of 8-bit
 * elements.  The first half of the return vector (e.g., for SSE2, bits 0-63)
 * uses v1, and the second half (e.g., for SSE2, bits 64-127) uses v2.
 *
 * NB: The upper 8 bits of each 16-bit element must be zeros, else this will
 * produce different results on different architectures.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_pack_16(const Vector8 v1, const Vector8 v2)
{
    Vector8     mask PG_USED_FOR_ASSERTS_ONLY;

    mask = vector8_interleave_low(vector8_broadcast(0), vector8_broadcast(0xff));
    Assert(!vector8_has_ge(vector8_and(v1, mask), 1));
    Assert(!vector8_has_ge(vector8_and(v2, mask), 1));
#ifdef USE_SSE2
    return _mm_packus_epi16(v1, v2);
#elif defined(USE_NEON)
    return vuzp1q_u8(v1, v2);
#endif
}
#endif                          /* ! USE_NO_SIMD */
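
/*
 * Editor's sketch, assuming a little-endian lane layout: eight 16-bit
 * values with zeroed high bytes pack down to their low bytes, with v1
 * supplying the first eight output bytes and v2 the last eight.
 * vector8_pack_16_demo() is an illustrative name only.
 */
#ifndef USE_NO_SIMD
static inline void
vector8_pack_16_demo(void)
{
    /* 16-bit lanes 0x0000..0x0007 and 0x0008..0x000F, high bytes zero */
    const uint8 a[16] = {0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0};
    const uint8 b[16] = {8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0};
    const uint8 expected[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    uint8       out[16];
    Vector8     v1,
                v2;

    vector8_load(&v1, a);
    vector8_load(&v2, b);
    vector8_store(out, vector8_pack_16(v1, v2));
    Assert(memcmp(out, expected, sizeof(out)) == 0);
}
#endif                          /* ! USE_NO_SIMD */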

/*
 * Unsigned shift left of each 32-bit element in the vector by "i" bits.
 *
 * XXX AArch64 requires an integer literal, so we have to list all expected
 * values of "i" from all callers in a switch statement.  If you add a new
 * caller, be sure your expected values of "i" are handled.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_shift_left(const Vector8 v1, int i)
{
#ifdef USE_SSE2
    return _mm_slli_epi32(v1, i);
#elif defined(USE_NEON)
    switch (i)
    {
        case 4:
            return (Vector8) vshlq_n_u32((Vector32) v1, 4);
        default:
            Assert(false);
            return vector8_broadcast(0);
    }
#endif
}
#endif                          /* ! USE_NO_SIMD */

/*
 * Unsigned shift right of each 32-bit element in the vector by "i" bits.
 *
 * XXX AArch64 requires an integer literal, so we have to list all expected
 * values of "i" from all callers in a switch statement.  If you add a new
 * caller, be sure your expected values of "i" are handled.
 */
#ifndef USE_NO_SIMD
static inline Vector8
vector8_shift_right(const Vector8 v1, int i)
{
#ifdef USE_SSE2
    return _mm_srli_epi32(v1, i);
#elif defined(USE_NEON)
    switch (i)
    {
        case 4:
            return (Vector8) vshrq_n_u32((Vector32) v1, 4);
        case 8:
            return (Vector8) vshrq_n_u32((Vector32) v1, 8);
        default:
            Assert(false);
            return vector8_broadcast(0);
    }
#endif
}
#endif                          /* ! USE_NO_SIMD */

#endif                          /* SIMD_H */
