From c15019736cf2eb0bb7414987757f5d4278526a93 Mon Sep 17 00:00:00 2001
From: "Marcus D. R. Klarqvist"
Date: Mon, 19 Aug 2019 00:22:39 +0100
Subject: [PATCH] assertion

Cross-check each benchmarked popcount kernel against the scalar reference.

benchmark.cpp:
  * Move popcount_scalar_naive_nosimd() (body unchanged) from below
    get_cpu_cycles() to just after generate_random_data(), so that its
    definition precedes its new use in linux_popcount_wrapper().
  * linux_popcount_wrapper() now keeps the kernel result in a local,
    recomputes the count with popcount_scalar_naive_nosimd() on the same
    buffer, and asserts that both agree before adding the result to the
    running total. The reference call is placed after unified.end(), i.e.
    outside the unified.start()/unified.end() pair. assert() expands to
    nothing when NDEBUG is defined, so the comparison is only performed in
    builds without NDEBUG.

libalgebra.h:
  * STORM_popcnt_avx512() indexes `data` in 64-bit words, so derive the
    AVX-512 / AVX2 / SSE iteration counts and the scalar tail bound directly
    from n_ints instead of from n_ints / 64.
  * Start the scalar tail at
    8*n_cycles + 4*n_cycles_avx2 + 2*n_cycles_sse, which is the number of
    words consumed by the three vector stages.
---
Review notes (text here is ignored by `git am`):
  * The last hunk's added line uses 4*n_cycles_avx2 where the patch as first
    posted used 4*n_cycles. With 4*n_cycles the tail loop re-counts words
    0..3 when n_ints is 4..7 (n_cycles == 0, n_cycles_avx2 == 1) and skips
    trailing words when n_cycles >= 2 (e.g. n_ints == 21 starts at 24).
    The post-image blob id on the libalgebra.h `index` line therefore no
    longer corresponds to the result of applying this patch.
  * Line breaks and indentation were reconstructed from a whitespace-collapsed
    copy; use --ignore-whitespace if context lines fail to match.
  * popcount_scalar_naive_nosimd() reads 8 bytes per iteration and its
    `len % 8 == 0` assertion is commented out; callers here pass n_bitmaps*8.

 benchmark.cpp | 58 +++++++++++++++++++++++++++------------------------
 libalgebra.h  |  9 ++++----
 2 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/benchmark.cpp b/benchmark.cpp
index 09fba3d..905f006 100644
--- a/benchmark.cpp
+++ b/benchmark.cpp
@@ -33,6 +33,32 @@ uint64_t* generate_random_data(uint32_t n_bitmaps) {
     return mem;
 }
 
+#if !defined(__clang__) && !defined(_MSC_VER)
+__attribute__((optimize("no-tree-vectorize")))
+#endif
+uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) {
+    uint64_t total = 0;
+    // for (int i = 0; i < len; ++i) {
+    //    total += STORM_popcount64(data1[i] & data2[i]);
+    // }
+    // assert(len % 8 == 0);
+
+    for (int j = 0; j < len; j += 8) {
+        // total += STORM_popcount64(data[i]);
+        // diff = data1[i] & data2[i];
+        total += STORM_popcnt_lookup8bit[data[j+0]];
+        total += STORM_popcnt_lookup8bit[data[j+1]];
+        total += STORM_popcnt_lookup8bit[data[j+2]];
+        total += STORM_popcnt_lookup8bit[data[j+3]];
+        total += STORM_popcnt_lookup8bit[data[j+4]];
+        total += STORM_popcnt_lookup8bit[data[j+5]];
+        total += STORM_popcnt_lookup8bit[data[j+6]];
+        total += STORM_popcnt_lookup8bit[data[j+7]];
+    }
+
+    return total;
+}
+
 #ifdef __linux__
 
 #include <asm/unistd.h> // for __NR_perf_event_open
@@ -266,10 +292,14 @@ int linux_popcount_wrapper(std::string name,
 
         unified.start();
         // Call argument subroutine pointer.
-        total += (*f)((uint8_t*)mem1, n_bitmaps*8);
+        uint64_t a = (*f)((uint8_t*)mem1, n_bitmaps*8);
         unified.end(results);
         allresults.push_back(results);
 
+        uint64_t b = popcount_scalar_naive_nosimd((uint8_t*)mem1, n_bitmaps*8);
+        assert(a == b);
+        total += a;
+
         STORM_aligned_free(mem1);
     }
 
@@ -332,32 +362,6 @@ uint64_t get_cpu_cycles() {
     return result;
 };
 
-#if !defined(__clang__) && !defined(_MSC_VER)
-__attribute__((optimize("no-tree-vectorize")))
-#endif
-uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) {
-    uint64_t total = 0;
-    // for (int i = 0; i < len; ++i) {
-    //    total += STORM_popcount64(data1[i] & data2[i]);
-    // }
-    // assert(len % 8 == 0);
-
-    for (int j = 0; j < len; j += 8) {
-        // total += STORM_popcount64(data[i]);
-        // diff = data1[i] & data2[i];
-        total += STORM_popcnt_lookup8bit[data[j+0]];
-        total += STORM_popcnt_lookup8bit[data[j+1]];
-        total += STORM_popcnt_lookup8bit[data[j+2]];
-        total += STORM_popcnt_lookup8bit[data[j+3]];
-        total += STORM_popcnt_lookup8bit[data[j+4]];
-        total += STORM_popcnt_lookup8bit[data[j+5]];
-        total += STORM_popcnt_lookup8bit[data[j+6]];
-        total += STORM_popcnt_lookup8bit[data[j+7]];
-    }
-
-    return total;
-}
-
 #if !defined(__clang__) && !defined(_MSC_VER)
 __attribute__((optimize("no-tree-vectorize")))
 #endif
diff --git a/libalgebra.h b/libalgebra.h
index 1948dde..ac0a790 100644
--- a/libalgebra.h
+++ b/libalgebra.h
@@ -2935,10 +2935,9 @@ uint64_t STORM_popcnt_avx512(const uint64_t* data,
                              const size_t n_ints)
 {
     uint64_t count = 0;
-    const uint64_t n_64b = n_ints / 64;
-    const uint32_t n_cycles = n_64b / 8;
-    const uint32_t n_cycles_avx2 = (n_64b % 8) / 4;
-    const uint32_t n_cycles_sse = ((n_64b % 8) % 4) / 2;
+    const uint32_t n_cycles = n_ints / 8;
+    const uint32_t n_cycles_avx2 = (n_ints % 8) / 4;
+    const uint32_t n_cycles_sse = ((n_ints % 8) % 4) / 2;
 
     const __m512i* r1 = (__m512i*)&data[0];
     const __m256i* r2 = (__m256i*)&data[n_cycles*8];
@@ -2948,7 +2947,7 @@ uint64_t STORM_popcnt_avx512(const uint64_t* data,
     count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2);
     count += STORM_popcnt_csa_sse4(r3, n_cycles_sse);
 
-    for (int i = (8*n_cycles + 4*n_cycles + 2*n_cycles_sse); i < n_64b; ++i) {
+    for (int i = (8*n_cycles + 4*n_cycles_avx2 + 2*n_cycles_sse); i < n_ints; ++i) {
         count += STORM_POPCOUNT(data[i]);
     }
 