LCOV - code coverage report
Current view:  top level - src/port - pg_popcount_avx512.c (source / functions)
Test:          PostgreSQL 17devel
Date:          2024-05-02 11:11:27

                    Hit   Total   Coverage
Lines:                0      48      0.0 %
Functions:            0       2      0.0 %

Legend: Lines: hit | not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * pg_popcount_avx512.c
       4             :  *    Holds the AVX-512 pg_popcount() implementation.
       5             :  *
       6             :  * Copyright (c) 2024, PostgreSQL Global Development Group
       7             :  *
       8             :  * IDENTIFICATION
       9             :  *    src/port/pg_popcount_avx512.c
      10             :  *
      11             :  *-------------------------------------------------------------------------
      12             :  */
      13             : #include "c.h"
      14             : 
      15             : #include <immintrin.h>
      16             : 
      17             : #include "port/pg_bitutils.h"
      18             : 
      19             : /*
      20             :  * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to
      21             :  * use AVX-512 intrinsics, but we check it anyway to be sure.  We piggy-back on
      22             :  * the function pointers that are only used when TRY_POPCNT_FAST is set.
      23             :  */
      24             : #ifdef TRY_POPCNT_FAST
      25             : 
      26             : /*
      27             :  * pg_popcount_avx512
      28             :  *      Returns the number of 1-bits in buf
      29             :  */
      30             : uint64
      31           0 : pg_popcount_avx512(const char *buf, int bytes)
      32             : {
      33             :     __m512i     val,
      34             :                 cnt;
      35           0 :     __m512i     accum = _mm512_setzero_si512();
      36             :     const char *final;
      37             :     int         tail_idx;
      38           0 :     __mmask64   mask = ~UINT64CONST(0);
      39             : 
      40             :     /*
      41             :      * Align buffer down to avoid double load overhead from unaligned access.
      42             :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
      43             :      * iteration and ensure it is not empty.
      44             :      */
      45           0 :     mask <<= ((uintptr_t) buf) % sizeof(__m512i);
      46           0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
      47           0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
      48           0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
      49             : 
      50             :     /*
      51             :      * Iterate through all but the final iteration.  Starting from the second
      52             :      * iteration, the mask is ignored.
      53             :      */
      54           0 :     if (buf < final)
      55             :     {
      56           0 :         val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
      57           0 :         cnt = _mm512_popcnt_epi64(val);
      58           0 :         accum = _mm512_add_epi64(accum, cnt);
      59             : 
      60           0 :         buf += sizeof(__m512i);
      61           0 :         mask = ~UINT64CONST(0);
      62             : 
      63           0 :         for (; buf < final; buf += sizeof(__m512i))
      64             :         {
      65           0 :             val = _mm512_load_si512((const __m512i *) buf);
      66           0 :             cnt = _mm512_popcnt_epi64(val);
      67           0 :             accum = _mm512_add_epi64(accum, cnt);
      68             :         }
      69             :     }
      70             : 
      71             :     /* Final iteration needs to ignore bytes that are not within the length */
      72           0 :     mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
      73             : 
      74           0 :     val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf);
      75           0 :     cnt = _mm512_popcnt_epi64(val);
      76           0 :     accum = _mm512_add_epi64(accum, cnt);
      77             : 
      78           0 :     return _mm512_reduce_add_epi64(accum);
      79             : }
      80             : 
      81             : /*
      82             :  * pg_popcount_masked_avx512
      83             :  *      Returns the number of 1-bits in buf after applying the mask to each byte
      84             :  */
      85             : uint64
      86           0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
      87             : {
      88             :     __m512i     val,
      89             :                 vmasked,
      90             :                 cnt;
      91           0 :     __m512i     accum = _mm512_setzero_si512();
      92             :     const char *final;
      93             :     int         tail_idx;
      94           0 :     __mmask64   bmask = ~UINT64CONST(0);
      95           0 :     const       __m512i maskv = _mm512_set1_epi8(mask);
      96             : 
      97             :     /*
      98             :      * Align buffer down to avoid double load overhead from unaligned access.
      99             :      * Calculate a mask to ignore preceding bytes.  Find start offset of final
     100             :      * iteration and ensure it is not empty.
     101             :      */
     102           0 :     bmask <<= ((uintptr_t) buf) % sizeof(__m512i);
     103           0 :     tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1;
     104           0 :     final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1);
     105           0 :     buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf);
     106             : 
     107             :     /*
     108             :      * Iterate through all but the final iteration.  Starting from the second
     109             :      * iteration, the mask is ignored.
     110             :      */
     111           0 :     if (buf < final)
     112             :     {
     113           0 :         val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     114           0 :         vmasked = _mm512_and_si512(val, maskv);
     115           0 :         cnt = _mm512_popcnt_epi64(vmasked);
     116           0 :         accum = _mm512_add_epi64(accum, cnt);
     117             : 
     118           0 :         buf += sizeof(__m512i);
     119           0 :         bmask = ~UINT64CONST(0);
     120             : 
     121           0 :         for (; buf < final; buf += sizeof(__m512i))
     122             :         {
     123           0 :             val = _mm512_load_si512((const __m512i *) buf);
     124           0 :             vmasked = _mm512_and_si512(val, maskv);
     125           0 :             cnt = _mm512_popcnt_epi64(vmasked);
     126           0 :             accum = _mm512_add_epi64(accum, cnt);
     127             :         }
     128             :     }
     129             : 
     130             :     /* Final iteration needs to ignore bytes that are not within the length */
     131           0 :     bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx));
     132             : 
     133           0 :     val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf);
     134           0 :     vmasked = _mm512_and_si512(val, maskv);
     135           0 :     cnt = _mm512_popcnt_epi64(vmasked);
     136           0 :     accum = _mm512_add_epi64(accum, cnt);
     137             : 
     138           0 :     return _mm512_reduce_add_epi64(accum);
     139             : }
     140             : 
     141             : #endif                          /* TRY_POPCNT_FAST */
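
For readers skimming the report, the two uncovered functions compute the same results as the scalar reference code below. This is a minimal sketch written purely for illustration (it is not part of the PostgreSQL tree) and assumes a GCC/Clang-style __builtin_popcount; the real callers dispatch to the AVX-512 versions only on capable CPUs.

    #include <stdint.h>
    #include <stdio.h>

    /* Same result as pg_popcount_avx512(): number of 1-bits in buf. */
    static uint64_t
    popcount_ref(const char *buf, int bytes)
    {
        uint64_t    cnt = 0;

        for (int i = 0; i < bytes; i++)
            cnt += __builtin_popcount((unsigned char) buf[i]);
        return cnt;
    }

    /*
     * Same result as pg_popcount_masked_avx512(): number of 1-bits after
     * ANDing every byte with the given mask.
     */
    static uint64_t
    popcount_masked_ref(const char *buf, int bytes, uint8_t mask)
    {
        uint64_t    cnt = 0;

        for (int i = 0; i < bytes; i++)
            cnt += __builtin_popcount((unsigned char) buf[i] & mask);
        return cnt;
    }

    int
    main(void)
    {
        const char  buf[] = {0x0F, (char) 0xF0, 0x01};

        /* prints "9 5" */
        printf("%llu %llu\n",
               (unsigned long long) popcount_ref(buf, 3),
               (unsigned long long) popcount_masked_ref(buf, 3, 0x0F));
        return 0;
    }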

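The trickiest part of both functions is the pointer arithmetic that aligns the buffer down to a 64-byte boundary and builds load masks for the partial head and tail iterations. The sketch below reproduces just that arithmetic for a made-up buffer address and length, with no AVX-512 required; ALIGN_DOWN is a local stand-in for the TYPEALIGN_DOWN() macro from c.h so the example compiles outside the PostgreSQL tree, and all values are illustrative.

    #include <stdint.h>
    #include <stdio.h>

    #define VECLEN 64           /* stand-in for sizeof(__m512i) */
    /* local stand-in for the TYPEALIGN_DOWN() macro from c.h */
    #define ALIGN_DOWN(p)   ((uintptr_t) (p) & ~((uintptr_t) (VECLEN - 1)))

    int
    main(void)
    {
        /* hypothetical 100-byte buffer starting 7 bytes past a 64-byte boundary */
        uintptr_t   buf = 0x1007;
        int         bytes = 100;

        /* head mask: clear bits for the 7 bytes before the real start */
        uint64_t    mask = ~UINT64_C(0) << (buf % VECLEN);

        /* number of valid bytes in the final 64-byte iteration (1..64) */
        int         tail_idx = (int) ((buf + bytes - 1) % VECLEN) + 1;

        uintptr_t   final = ALIGN_DOWN(buf + bytes - 1);    /* 0x1040 */
        uintptr_t   start = ALIGN_DOWN(buf);                /* 0x1000 */

        /* tail mask keeps only the first tail_idx bytes of the final load */
        uint64_t    tail_mask = ~UINT64_C(0) >> (VECLEN - tail_idx);

        printf("start=%#lx final=%#lx tail_idx=%d\n",
               (unsigned long) start, (unsigned long) final, tail_idx);
        printf("head mask=%#llx tail mask=%#llx\n",
               (unsigned long long) mask, (unsigned long long) tail_mask);
        return 0;
    }

With these example values, the first masked load covers 0x1000..0x103F while the head mask skips the seven bytes before the real start, the aligned inner loop does not run (start + 64 == final), and the final masked load at 0x1040 keeps only the first 43 bytes, i.e. up to 0x106A, the last byte of the buffer.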
Generated by: LCOV version 1.14