Skip to content

Commit

Permalink
Speedup AVX512 trailing bytes
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Jun 29, 2024
1 parent 972333a commit 6b986a4
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 23 deletions.
10 changes: 7 additions & 3 deletions benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,15 +92,19 @@ int main(int argc, char* argv[])

#if defined(LIBPOPCNT_HAVE_CPUID)
int cpuid = get_cpuid();
if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) && bytes >= 48)
if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) && bytes >= 40)
algo = "AVX512";
else if ((cpuid & LIBPOPCNT_BIT_AVX2) && bytes >= 512)
algo = "AVX2";
else if (cpuid & LIBPOPCNT_BIT_POPCNT)
algo = "POPCNT";
#else
#if defined(LIBPOPCNT_HAVE_AVX512) && (defined(__AVX512__) || (defined(__AVX512F__) && defined(__AVX512VPOPCNTDQ__)))
if (algo.empty() && bytes >= 48)
#if defined(LIBPOPCNT_HAVE_AVX512) && (defined(__AVX512__) || \
(defined(__AVX512F__) && \
defined(__AVX512BW__) && \
defined(__AVX512VPOPCNTDQ__) && \
defined(__AVX512BITALG__)))
if (algo.empty() && bytes >= 40)
algo = "AVX512";
#endif
#if defined(LIBPOPCNT_HAVE_AVX2) && defined(__AVX2__)
Expand Down
30 changes: 10 additions & 20 deletions libpopcnt.h
Original file line number Diff line number Diff line change
Expand Up @@ -173,10 +173,10 @@ extern "C" {
*/
static inline uint64_t popcnt64_bitwise(uint64_t x)
{
uint64_t m1 = 0x5555555555555555ll;
uint64_t m2 = 0x3333333333333333ll;
uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll;
uint64_t h01 = 0x0101010101010101ll;
uint64_t m1 = 0x5555555555555555ull;
uint64_t m2 = 0x3333333333333333ull;
uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
uint64_t h01 = 0x0101010101010101ull;

x -= (x >> 1) & m1;
x = (x & m2) + ((x >> 2) & m2);
Expand Down Expand Up @@ -521,22 +521,12 @@ static inline uint64_t popcnt_avx512(const uint8_t* ptr8, uint64_t size)
cnt = _mm512_add_epi64(cnt, vec);
}

/* Process last 64 bytes */
if (i < size64)
{
__mmask8 mask = (__mmask8) (0xff >> (i + 8 - size64));
__m512i vec = _mm512_maskz_loadu_epi64(mask , &ptr64[i]);
vec = _mm512_popcnt_epi64(vec);
cnt = _mm512_add_epi64(cnt, vec);
}

uint64_t bytes = size % sizeof(uint64_t);
i *= sizeof(uint64_t);

/* Process last 8 bytes */
if (bytes != 0)
/* Process last 64 bytes */
if (i < size)
{
i = size - bytes;
__mmask64 mask = (__mmask64) (0xff >> (i + 8 - size));
__mmask64 mask = (__mmask64) (0xffffffffffffffffull >> (i + 64 - size));
__m512i vec = _mm512_maskz_loadu_epi8(mask, &ptr8[i]);
__m512i cnt8 = _mm512_popcnt_epi8(vec);
cnt8 = _mm512_sad_epu8(cnt8, _mm512_setzero_si512());
Expand Down Expand Up @@ -594,10 +584,10 @@ static uint64_t popcnt(const void* data, uint64_t size)
defined(__AVX512VPOPCNTDQ__) && \
defined(__AVX512BITALG__))
/* For tiny arrays AVX512 is not worth it */
if (i + 48 <= size)
if (i + 40 <= size)
#else
if ((cpuid & LIBPOPCNT_BIT_AVX512_VPOPCNTDQ) &&
i + 48 <= size)
i + 40 <= size)
#endif
return popcnt_avx512(ptr, size);
#endif
Expand Down

0 comments on commit 6b986a4

Please sign in to comment.