Line data Source code
1 : /*------------------------------------------------------------------------- 2 : * 3 : * pg_popcount_avx512.c 4 : * Holds the AVX-512 pg_popcount() implementation. 5 : * 6 : * Copyright (c) 2024, PostgreSQL Global Development Group 7 : * 8 : * IDENTIFICATION 9 : * src/port/pg_popcount_avx512.c 10 : * 11 : *------------------------------------------------------------------------- 12 : */ 13 : #include "c.h" 14 : 15 : #include <immintrin.h> 16 : 17 : #include "port/pg_bitutils.h" 18 : 19 : /* 20 : * It's probably unlikely that TRY_POPCNT_FAST won't be set if we are able to 21 : * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on 22 : * the function pointers that are only used when TRY_POPCNT_FAST is set. 23 : */ 24 : #ifdef TRY_POPCNT_FAST 25 : 26 : /* 27 : * pg_popcount_avx512 28 : * Returns the number of 1-bits in buf 29 : */ 30 : uint64 31 0 : pg_popcount_avx512(const char *buf, int bytes) 32 : { 33 : __m512i val, 34 : cnt; 35 0 : __m512i accum = _mm512_setzero_si512(); 36 : const char *final; 37 : int tail_idx; 38 0 : __mmask64 mask = ~UINT64CONST(0); 39 : 40 : /* 41 : * Align buffer down to avoid double load overhead from unaligned access. 42 : * Calculate a mask to ignore preceding bytes. Find start offset of final 43 : * iteration and ensure it is not empty. 44 : */ 45 0 : mask <<= ((uintptr_t) buf) % sizeof(__m512i); 46 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1; 47 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1); 48 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf); 49 : 50 : /* 51 : * Iterate through all but the final iteration. Starting from the second 52 : * iteration, the mask is ignored. 53 : */ 54 0 : if (buf < final) 55 : { 56 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); 57 0 : cnt = _mm512_popcnt_epi64(val); 58 0 : accum = _mm512_add_epi64(accum, cnt); 59 : 60 0 : buf += sizeof(__m512i); 61 0 : mask = ~UINT64CONST(0); 62 : 63 0 : for (; buf < final; buf += sizeof(__m512i)) 64 : { 65 0 : val = _mm512_load_si512((const __m512i *) buf); 66 0 : cnt = _mm512_popcnt_epi64(val); 67 0 : accum = _mm512_add_epi64(accum, cnt); 68 : } 69 : } 70 : 71 : /* Final iteration needs to ignore bytes that are not within the length */ 72 0 : mask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx)); 73 : 74 0 : val = _mm512_maskz_loadu_epi8(mask, (const __m512i *) buf); 75 0 : cnt = _mm512_popcnt_epi64(val); 76 0 : accum = _mm512_add_epi64(accum, cnt); 77 : 78 0 : return _mm512_reduce_add_epi64(accum); 79 : } 80 : 81 : /* 82 : * pg_popcount_masked_avx512 83 : * Returns the number of 1-bits in buf after applying the mask to each byte 84 : */ 85 : uint64 86 0 : pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) 87 : { 88 : __m512i val, 89 : vmasked, 90 : cnt; 91 0 : __m512i accum = _mm512_setzero_si512(); 92 : const char *final; 93 : int tail_idx; 94 0 : __mmask64 bmask = ~UINT64CONST(0); 95 0 : const __m512i maskv = _mm512_set1_epi8(mask); 96 : 97 : /* 98 : * Align buffer down to avoid double load overhead from unaligned access. 99 : * Calculate a mask to ignore preceding bytes. Find start offset of final 100 : * iteration and ensure it is not empty. 101 : */ 102 0 : bmask <<= ((uintptr_t) buf) % sizeof(__m512i); 103 0 : tail_idx = (((uintptr_t) buf + bytes - 1) % sizeof(__m512i)) + 1; 104 0 : final = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf + bytes - 1); 105 0 : buf = (const char *) TYPEALIGN_DOWN(sizeof(__m512i), buf); 106 : 107 : /* 108 : * Iterate through all but the final iteration. Starting from the second 109 : * iteration, the mask is ignored. 110 : */ 111 0 : if (buf < final) 112 : { 113 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf); 114 0 : vmasked = _mm512_and_si512(val, maskv); 115 0 : cnt = _mm512_popcnt_epi64(vmasked); 116 0 : accum = _mm512_add_epi64(accum, cnt); 117 : 118 0 : buf += sizeof(__m512i); 119 0 : bmask = ~UINT64CONST(0); 120 : 121 0 : for (; buf < final; buf += sizeof(__m512i)) 122 : { 123 0 : val = _mm512_load_si512((const __m512i *) buf); 124 0 : vmasked = _mm512_and_si512(val, maskv); 125 0 : cnt = _mm512_popcnt_epi64(vmasked); 126 0 : accum = _mm512_add_epi64(accum, cnt); 127 : } 128 : } 129 : 130 : /* Final iteration needs to ignore bytes that are not within the length */ 131 0 : bmask &= (~UINT64CONST(0) >> (sizeof(__m512i) - tail_idx)); 132 : 133 0 : val = _mm512_maskz_loadu_epi8(bmask, (const __m512i *) buf); 134 0 : vmasked = _mm512_and_si512(val, maskv); 135 0 : cnt = _mm512_popcnt_epi64(vmasked); 136 0 : accum = _mm512_add_epi64(accum, cnt); 137 : 138 0 : return _mm512_reduce_add_epi64(accum); 139 : } 140 : 141 : #endif /* TRY_POPCNT_FAST */