LCOV - code coverage report
Current view: top level - src/include/port - simd.h (source / functions) Hit Total Coverage
Test: PostgreSQL 17devel Lines: 37 37 100.0 %
Date: 2024-05-08 15:11:20 Functions: 16 16 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * simd.h
       4             :  *    Support for platform-specific vector operations.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2024, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  * src/include/port/simd.h
      10             :  *
      11             :  * NOTES
      12             :  * - VectorN in this file refers to a register where the element operands
      13             :  * are N bits wide. The vector width is platform-specific, so users that care
      14             :  * about that will need to inspect "sizeof(VectorN)".
      15             :  *
      16             :  *-------------------------------------------------------------------------
      17             :  */
      18             : #ifndef SIMD_H
      19             : #define SIMD_H
      20             : 
      21             : #if (defined(__x86_64__) || defined(_M_AMD64))
      22             : /*
      23             :  * SSE2 instructions are part of the spec for the 64-bit x86 ISA. We assume
      24             :  * that compilers targeting this architecture understand SSE2 intrinsics.
      25             :  *
      26             :  * We use emmintrin.h rather than the comprehensive header immintrin.h in
      27             :  * order to exclude extensions beyond SSE2. This is because MSVC, at least,
      28             :  * will allow the use of intrinsics that haven't been enabled at compile
      29             :  * time.
      30             :  */
      31             : #include <emmintrin.h>
      32             : #define USE_SSE2
      33             : typedef __m128i Vector8;
      34             : typedef __m128i Vector32;
      35             : 
      36             : #elif defined(__aarch64__) && defined(__ARM_NEON)
      37             : /*
      38             :  * We use the Neon instructions if the compiler provides access to them (as
      39             :  * indicated by __ARM_NEON) and we are on aarch64.  While Neon support is
      40             :  * technically optional for aarch64, it appears that all available 64-bit
      41             :  * hardware does have it.  Neon exists in some 32-bit hardware too, but we
      42             :  * could not realistically use it there without a run-time check, which seems
      43             :  * not worth the trouble for now.
      44             :  */
      45             : #include <arm_neon.h>
      46             : #define USE_NEON
      47             : typedef uint8x16_t Vector8;
      48             : typedef uint32x4_t Vector32;
      49             : 
      50             : #else
      51             : /*
      52             :  * If no SIMD instructions are available, we can in some cases emulate vector
      53             :  * operations using bitwise operations on unsigned integers.  Note that many
      54             :  * of the functions in this file presently do not have non-SIMD
      55             :  * implementations.  In particular, none of the functions involving Vector32
      56             :  * are implemented without SIMD since it's likely not worthwhile to represent
      57             :  * two 32-bit integers using a uint64.
      58             :  */
      59             : #define USE_NO_SIMD
      60             : typedef uint64 Vector8;
      61             : #endif
      62             : 
      63             : /* load/store operations */
      64             : static inline void vector8_load(Vector8 *v, const uint8 *s);
      65             : #ifndef USE_NO_SIMD
      66             : static inline void vector32_load(Vector32 *v, const uint32 *s);
      67             : #endif
      68             : 
      69             : /* assignment operations */
      70             : static inline Vector8 vector8_broadcast(const uint8 c);
      71             : #ifndef USE_NO_SIMD
      72             : static inline Vector32 vector32_broadcast(const uint32 c);
      73             : #endif
      74             : 
      75             : /* element-wise comparisons to a scalar */
      76             : static inline bool vector8_has(const Vector8 v, const uint8 c);
      77             : static inline bool vector8_has_zero(const Vector8 v);
      78             : static inline bool vector8_has_le(const Vector8 v, const uint8 c);
      79             : static inline bool vector8_is_highbit_set(const Vector8 v);
      80             : #ifndef USE_NO_SIMD
      81             : static inline bool vector32_is_highbit_set(const Vector32 v);
      82             : static inline uint32 vector8_highbit_mask(const Vector8 v);
      83             : #endif
      84             : 
      85             : /* arithmetic operations */
      86             : static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
      87             : #ifndef USE_NO_SIMD
      88             : static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
      89             : static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
      90             : #endif
      91             : 
      92             : /*
      93             :  * comparisons between vectors
      94             :  *
      95             :  * Note: These return a vector rather than boolean, which is why we don't
      96             :  * have non-SIMD implementations.
      97             :  */
      98             : #ifndef USE_NO_SIMD
      99             : static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
     100             : static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
     101             : static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
     102             : #endif
     103             : 
     104             : /*
     105             :  * Load a chunk of memory into the given vector.
     106             :  */
     107             : static inline void
     108    22817886 : vector8_load(Vector8 *v, const uint8 *s)
     109             : {
     110             : #if defined(USE_SSE2)
     111    22817886 :     *v = _mm_loadu_si128((const __m128i *) s);
     112             : #elif defined(USE_NEON)
     113             :     *v = vld1q_u8(s);
     114             : #else
     115             :     memcpy(v, s, sizeof(Vector8));
     116             : #endif
     117    22817886 : }
     118             : 
     119             : #ifndef USE_NO_SIMD
     120             : static inline void
     121         272 : vector32_load(Vector32 *v, const uint32 *s)
     122             : {
     123             : #ifdef USE_SSE2
     124         272 :     *v = _mm_loadu_si128((const __m128i *) s);
     125             : #elif defined(USE_NEON)
     126             :     *v = vld1q_u32(s);
     127             : #endif
     128         272 : }
     129             : #endif                          /* ! USE_NO_SIMD */
     130             : 
     131             : /*
     132             :  * Create a vector with all elements set to the same value.
     133             :  */
     134             : static inline Vector8
     135    20220584 : vector8_broadcast(const uint8 c)
     136             : {
     137             : #if defined(USE_SSE2)
     138    40441168 :     return _mm_set1_epi8(c);
     139             : #elif defined(USE_NEON)
     140             :     return vdupq_n_u8(c);
     141             : #else
     142             :     return ~UINT64CONST(0) / 0xFF * c;
     143             : #endif
     144             : }
     145             : 
     146             : #ifndef USE_NO_SIMD
     147             : static inline Vector32
     148    10200708 : vector32_broadcast(const uint32 c)
     149             : {
     150             : #ifdef USE_SSE2
     151    20401416 :     return _mm_set1_epi32(c);
     152             : #elif defined(USE_NEON)
     153             :     return vdupq_n_u32(c);
     154             : #endif
     155             : }
     156             : #endif                          /* ! USE_NO_SIMD */
     157             : 
     158             : /*
     159             :  * Return true if any elements in the vector are equal to the given scalar.
     160             :  */
     161             : static inline bool
     162     5117482 : vector8_has(const Vector8 v, const uint8 c)
     163             : {
     164             :     bool        result;
     165             : 
     166             :     /* pre-compute the result for assert checking */
     167             : #ifdef USE_ASSERT_CHECKING
     168             :     bool        assert_result = false;
     169             : 
     170             :     for (Size i = 0; i < sizeof(Vector8); i++)
     171             :     {
     172             :         if (((const uint8 *) &v)[i] == c)
     173             :         {
     174             :             assert_result = true;
     175             :             break;
     176             :         }
     177             :     }
     178             : #endif                          /* USE_ASSERT_CHECKING */
     179             : 
     180             : #if defined(USE_NO_SIMD)
     181             :     /* any bytes in v equal to c will evaluate to zero via XOR */
     182             :     result = vector8_has_zero(v ^ vector8_broadcast(c));
     183             : #else
     184     5117482 :     result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
     185             : #endif
     186             : 
     187             :     Assert(assert_result == result);
     188     5117482 :     return result;
     189             : }
     190             : 
     191             : /*
     192             :  * Convenience function equivalent to vector8_has(v, 0)
     193             :  */
     194             : static inline bool
     195      485620 : vector8_has_zero(const Vector8 v)
     196             : {
     197             : #if defined(USE_NO_SIMD)
     198             :     /*
     199             :      * We cannot call vector8_has() here, because that would lead to a
     200             :      * circular definition.
     201             :      */
     202             :     return vector8_has_le(v, 0);
     203             : #else
     204      485620 :     return vector8_has(v, 0);
     205             : #endif
     206             : }
     207             : 
     208             : /*
     209             :  * Return true if any elements in the vector are less than or equal to the
     210             :  * given scalar.
     211             :  */
     212             : static inline bool
     213      485620 : vector8_has_le(const Vector8 v, const uint8 c)
     214             : {
     215      485620 :     bool        result = false;
     216             : 
     217             :     /* pre-compute the result for assert checking */
     218             : #ifdef USE_ASSERT_CHECKING
     219             :     bool        assert_result = false;
     220             : 
     221             :     for (Size i = 0; i < sizeof(Vector8); i++)
     222             :     {
     223             :         if (((const uint8 *) &v)[i] <= c)
     224             :         {
     225             :             assert_result = true;
     226             :             break;
     227             :         }
     228             :     }
     229             : #endif                          /* USE_ASSERT_CHECKING */
     230             : 
     231             : #if defined(USE_NO_SIMD)
     232             : 
     233             :     /*
     234             :      * To find bytes <= c, we can use bitwise operations to find bytes < c+1,
     235             :      * but it only works if c+1 <= 128 and if the highest bit in v is not set.
     236             :      * Adapted from
     237             :      * https://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
     238             :      */
     239             :     if ((int64) v >= 0 && c < 0x80)
     240             :         result = (v - vector8_broadcast(c + 1)) & ~v & vector8_broadcast(0x80);
     241             :     else
     242             :     {
     243             :         /* one byte at a time */
     244             :         for (Size i = 0; i < sizeof(Vector8); i++)
     245             :         {
     246             :             if (((const uint8 *) &v)[i] <= c)
     247             :             {
     248             :                 result = true;
     249             :                 break;
     250             :             }
     251             :         }
     252             :     }
     253             : #else
     254             : 
     255             :     /*
     256             :      * Use saturating subtraction to find bytes <= c, which will present as
     257             :      * NUL bytes.  This approach is a workaround for the lack of unsigned
     258             :      * comparison instructions on some architectures.
     259             :      */
     260      485620 :     result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
     261             : #endif
     262             : 
     263             :     Assert(assert_result == result);
     264      485620 :     return result;
     265             : }
     266             : 
     267             : /*
     268             :  * Return true if the high bit of any element is set
     269             :  */
     270             : static inline bool
     271     8001190 : vector8_is_highbit_set(const Vector8 v)
     272             : {
     273             : #ifdef USE_SSE2
     274     8001190 :     return _mm_movemask_epi8(v) != 0;
     275             : #elif defined(USE_NEON)
     276             :     return vmaxvq_u8(v) > 0x7F;
     277             : #else
     278             :     return v & vector8_broadcast(0x80);
     279             : #endif
     280             : }
     281             : 
     282             : /*
     283             :  * Exactly like vector8_is_highbit_set except for the input type, so it
     284             :  * looks at each byte separately.
     285             :  *
     286             :  * XXX x86 uses the same underlying type for 8-bit, 16-bit, and 32-bit
     287             :  * integer elements, but Arm does not, hence the need for a separate
     288             :  * function. We could instead adopt the behavior of Arm's vmaxvq_u32(), i.e.
     289             :  * check each 32-bit element, but that would require an additional mask
     290             :  * operation on x86.
     291             :  */
     292             : #ifndef USE_NO_SIMD
     293             : static inline bool
     294          68 : vector32_is_highbit_set(const Vector32 v)
     295             : {
     296             : #if defined(USE_NEON)
     297             :     return vector8_is_highbit_set((Vector8) v);
     298             : #else
     299          68 :     return vector8_is_highbit_set(v);
     300             : #endif
     301             : }
     302             : #endif                          /* ! USE_NO_SIMD */
     303             : 
     304             : /*
     305             :  * Return a bitmask formed from the high-bit of each element.
     306             :  */
     307             : #ifndef USE_NO_SIMD
     308             : static inline uint32
     309    11933124 : vector8_highbit_mask(const Vector8 v)
     310             : {
     311             : #ifdef USE_SSE2
     312    11933124 :     return (uint32) _mm_movemask_epi8(v);
     313             : #elif defined(USE_NEON)
     314             :     /*
     315             :      * Note: It would be faster to use vget_lane_u64 and vshrn_n_u16, but that
     316             :      * returns a uint64, making it inconvenient to combine mask values from
     317             :      * multiple vectors.
     318             :      */
     319             :     static const uint8 mask[16] = {
     320             :         1 << 0, 1 << 1, 1 << 2, 1 << 3,
     321             :         1 << 4, 1 << 5, 1 << 6, 1 << 7,
     322             :         1 << 0, 1 << 1, 1 << 2, 1 << 3,
     323             :         1 << 4, 1 << 5, 1 << 6, 1 << 7,
     324             :     };
     325             : 
     326             :     uint8x16_t  masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8((int8x16_t) v, 7));
     327             :     uint8x16_t  maskedhi = vextq_u8(masked, masked, 8);
     328             : 
     329             :     return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
     330             : #endif
     331             : }
     332             : #endif                          /* ! USE_NO_SIMD */
     333             : 
     334             : /*
     335             :  * Return the bitwise OR of the inputs
     336             :  */
     337             : static inline Vector8
     338    11534560 : vector8_or(const Vector8 v1, const Vector8 v2)
     339             : {
     340             : #ifdef USE_SSE2
     341    11534560 :     return _mm_or_si128(v1, v2);
     342             : #elif defined(USE_NEON)
     343             :     return vorrq_u8(v1, v2);
     344             : #else
     345             :     return v1 | v2;
     346             : #endif
     347             : }
     348             : 
     349             : #ifndef USE_NO_SIMD
     350             : static inline Vector32
     351         204 : vector32_or(const Vector32 v1, const Vector32 v2)
     352             : {
     353             : #ifdef USE_SSE2
     354         204 :     return _mm_or_si128(v1, v2);
     355             : #elif defined(USE_NEON)
     356             :     return vorrq_u32(v1, v2);
     357             : #endif
     358             : }
     359             : #endif                          /* ! USE_NO_SIMD */
     360             : 
     361             : /*
     362             :  * Return the result of subtracting the respective elements of the input
     363             :  * vectors using saturation (i.e., if the operation would yield a value less
     364             :  * than zero, zero is returned instead).  For more information on saturation
     365             :  * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
     366             :  */
     367             : #ifndef USE_NO_SIMD
     368             : static inline Vector8
     369      485620 : vector8_ssub(const Vector8 v1, const Vector8 v2)
     370             : {
     371             : #ifdef USE_SSE2
     372      485620 :     return _mm_subs_epu8(v1, v2);
     373             : #elif defined(USE_NEON)
     374             :     return vqsubq_u8(v1, v2);
     375             : #endif
     376             : }
     377             : #endif                          /* ! USE_NO_SIMD */
     378             : 
     379             : /*
     380             :  * Return a vector with all bits set in each lane where the corresponding
     381             :  * lanes in the inputs are equal.
     382             :  */
     383             : #ifndef USE_NO_SIMD
     384             : static inline Vector8
     385    22817886 : vector8_eq(const Vector8 v1, const Vector8 v2)
     386             : {
     387             : #ifdef USE_SSE2
     388    22817886 :     return _mm_cmpeq_epi8(v1, v2);
     389             : #elif defined(USE_NEON)
     390             :     return vceqq_u8(v1, v2);
     391             : #endif
     392             : }
     393             : #endif                          /* ! USE_NO_SIMD */
     394             : 
     395             : #ifndef USE_NO_SIMD
     396             : static inline Vector32
     397         272 : vector32_eq(const Vector32 v1, const Vector32 v2)
     398             : {
     399             : #ifdef USE_SSE2
     400         272 :     return _mm_cmpeq_epi32(v1, v2);
     401             : #elif defined(USE_NEON)
     402             :     return vceqq_u32(v1, v2);
     403             : #endif
     404             : }
     405             : #endif                          /* ! USE_NO_SIMD */
     406             : 
     407             : /*
     408             :  * Given two vectors, return a vector with the minimum element of each.
     409             :  */
     410             : #ifndef USE_NO_SIMD
     411             : static inline Vector8
     412      209452 : vector8_min(const Vector8 v1, const Vector8 v2)
     413             : {
     414             : #ifdef USE_SSE2
     415      209452 :     return _mm_min_epu8(v1, v2);
     416             : #elif defined(USE_NEON)
     417             :     return vminq_u8(v1, v2);
     418             : #endif
     419             : }
     420             : #endif                          /* ! USE_NO_SIMD */
     421             : 
     422             : #endif                          /* SIMD_H */

Generated by: LCOV version 1.14