diff --git a/.travis.yml b/.travis.yml index 633dbe6..e7fbd6e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -191,4 +191,9 @@ before_script: script: - cmake . - make - - ./benchmark \ No newline at end of file + - | + if [[ "${TRAVIS_OS_NAME}" == "linux" ]]; then + sudo ./benchmark -r 10 + else + ./benchmark -r 10 + fi \ No newline at end of file diff --git a/README.md b/README.md index bd16743..d8b98ec 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,7 @@ The core algorithms are described in the papers: ### Speedup -Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays and compute the same statistics many times. This reflect the fastest possible throughput if you never have to leave the destination cache-level. +Sample performance metrics (practical upper limit) on AVX512BW machine. We simulate a single data array or pairs of data arrays in a aligned memory location and compute the same statistics many times using the command `benchmark -p -r 10000` (required Linux `perf` subsystem). This reflect the fastest possible throughput if you never have to leave the destination cache-level. The host architecture used is a 10 nm Cannon Lake [Core i3-8121U](https://ark.intel.com/content/www/us/en/ark/products/136863/intel-core-i3-8121u-processor-4m-cache-up-to-3-20-ghz.html) with gcc (GCC) 8.2.1 20180905 (Red Hat 8.2.1-3). ### POSPOPCNT @@ -49,24 +49,58 @@ CPUs compared to a naive unvectorized solution ### POPCNT Fold speedup compared to a naive unvectorized algorithm -(`popcount_scalar_naive_nosimd`) for different array sizes. - -| Algorithm | 8 | 32 | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | -|-----------|-----|-----|------|------|------|-------|-------|-------|-------| -| popcnt | 3.3 | 6.5 | 20.1 | 25.4 | 62.4 | 116.7 | 175.1 | 226.4 | 247.8 | +(`popcount_scalar_naive_nosimd`) for different array sizes as (CPU cycles/64-bit word, Instructions/64-bit word): + +| Words | libalgebra.h | Scalar | Speedup | +|---------|--------------|---------------|---------| +| 4 | 27.75 (37) | 26.75 (33.5) | 1 | +| 8 | 16.38 (25.5) | 17.38 (30.25) | 1.1 | +| 16 | 10.5 (19.94) | 12.75 (28.63) | 1.2 | +| 32 | 7.72 (17.16) | 10.69 (27.81) | 1.4 | +| 64 | 3.09 (4.36) | 9.61 (27.41) | 3.1 | +| 128 | 2.53 (2.73) | 8.84 (27.2) | 3.5 | +| 256 | 1.35 (1.7) | 8.5 (27.1) | 6.3 | +| 512 | 0.67 (1.18) | 8.33 (27.05) | 12.4 | +| 1024 | 0.5 (0.92) | 8.25 (27.03) | 16.4 | +| 2048 | 0.41 (0.79) | 8.15 (27.01) | 20.1 | +| 4096 | 0.46 (0.72) | 8.12 (27.01) | 17.8 | +| 8192 | 0.39 (0.69) | 8.11 (27) | 21 | +| 16384 | 0.39 (0.67) | 8.1 (27) | 20.6 | +| 32768 | 0.89 (0.66) | 8.1 (27) | 9.1 | +| 65536 | 0.84 (0.66) | 8.1 (27) | 9.6 | +| 131072 | 0.68 (0.66) | 8.09 (27) | 11.9 | +| 262144 | 1.11 (0.66) | 8.09 (27) | 7.3 | +| 524288 | 1.84 (0.66) | 8.12 (27) | 4.4 | +| 1048576 | 1.95 (0.66) | 8.15 (27) | 4.2 | ### Set algebra Fold speedup compared to naive unvectorized solution (`*_scalar_naive_nosimd`) -for different array sizes (in number of _pairs_ of 64-bit values). These +for different array sizes (in number of _pairs_ of 64-bit word but results reported per _single_ 64-bit word). These functions are identifical with the exception of the bitwise operator used (AND, OR, or XOR) which all have identical latency and throughput (CPI). -| Algorithm | 8 | 32 | 128 | 256 | 512 | 1024 | 2048 | 4096 | 8192 | -|-----------------|------|-------|-------|-------|-------|-------|-------|-------|-------| -| intersect count | 4.73 | 10.8 | 17.58 | 24.82 | 31 | 35.78 | 37.75 | 23.08 | 20.81 | -| union count | 4.64 | 10.96 | 17.19 | 24.88 | 31.09 | 35.74 | 37.95 | 22.92 | 21.11 | -| diff count | 4.57 | 10.93 | 17.31 | 24.78 | 30.98 | 35.74 | 37.87 | 23.31 | 21.42 | +| Words | libalgebra.h | Scalar | Speedup | +|---------|--------------|---------------|---------| +| 4 | 17.63 (8.63) | 14.63 (22.75) | 0.8 | +| 8 | 8.13 (5.44) | 10 (20.88) | 1.2 | +| 16 | 4.69 (3.84) | 7.91 (19.94) | 1.7 | +| 32 | 2.38 (2.56) | 6.59 (19.47) | 2.8 | +| 64 | 1.82 (2.06) | 5.87 (19.23) | 3.2 | +| 128 | 0.88 (0.89) | 5.43 (19.12) | 6.2 | +| 256 | 0.57 (0.64) | 5.18 (19.06) | 9.2 | +| 512 | 0.41 (0.51) | 5.11 (19.03) | 12.4 | +| 1024 | 0.33 (0.45) | 5.06 (19.02) | 15.3 | +| 2048 | 0.39 (0.41) | 5.03 (19.01) | 13.1 | +| 4096 | 0.36 (0.4) | 5.02 (19) | 13.9 | +| 8192 | 0.37 (0.39) | 5.01 (19) | 13.7 | +| 16384 | 0.55 (0.39) | 5.01 (19) | 9.1 | +| 32768 | 0.55 (0.39) | 5 (19) | 9.2 | +| 65536 | 0.52 (0.38) | 5 (19) | 9.7 | +| 131072 | 0.56 (0.38) | 5.01 (19) | 9 | +| 262144 | 1.25 (0.38) | 5.02 (19) | 4 | +| 524288 | 1.76 (0.38) | 5.03 (19) | 2.9 | +| 1048576 | 1.81 (0.38) | 5.07 (19) | 2.8 | ## C/C++ API @@ -101,7 +135,7 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t size, uint32_t* flags); * Compute the intersection, union, or diff cardinality between pairs of bitmaps * @data1: A 64-bit array * @data2: A 64-bit array - * @size: Size of data in bytes + * @size: Size of data in 64-bit words */ // Intersect cardinality uint64_t STORM_intersect_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); @@ -111,6 +145,25 @@ uint64_t STORM_union_count(const uint64_t* data1, const uint64_t* data2, const u uint64_t STORM_diff_count(const uint64_t* data1, const uint64_t* data2, const uint32_t size); ``` +### Advanced use + +Retrieve a function pointer to the optimal function given the target length. + +```C +STORM_compute_func STORM_get_intersection_count_func(const size_t n_bitmaps_vector); +STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector); +STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector); +``` + +Portable memory alignment. + +```C +#include "libalgebra.h" + +void* STORM_aligned_malloc(size_t alignment, size_t size); +void STORM_aligned_free(void* memblock); +``` + ## How it works On x86 CPUs ```libalgebra.h``` uses a combination of algorithms depending on the input vector size and what instruction set your CPU supports. These checks are performed during **run-time**. \ No newline at end of file diff --git a/benchmark.cpp b/benchmark.cpp index f140a2e..905f006 100644 --- a/benchmark.cpp +++ b/benchmark.cpp @@ -5,21 +5,58 @@ #include #include #include +#if !defined(_MSC_VER) +#include "getopt.h" +#endif + +uint64_t* generate_random_data(uint32_t n_bitmaps) { + // Clear data + // uint32_t n_bitmaps = ceil(n / 64.0); + // memset(data, 0, sizeof(uint64_t)*n_bitmaps); + uint64_t* mem = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); -void generate_random_data(uint64_t* data, uint32_t range, uint32_t n) { // PRNG - std::uniform_int_distribution distr(0, range-1); // right inclusive + std::uniform_int_distribution distr(0, std::numeric_limits::max()-1); // right inclusive std::random_device rd; // obtain a random number from hardware std::mt19937 eng(rd()); // seed the generator // Generate some random data. uint32_t n_unique = 0; // while (n_unique < n) { - for (int i = 0; i < n; ++i) { - uint32_t val = distr(eng); - n_unique += (data[val / 64] & (1ULL << (val % 64))) == 0; - data[val / 64] |= 1ULL << (val % 64); + for (int i = 0; i < n_bitmaps; ++i) { + uint32_t val1 = distr(eng); + uint32_t val2 = distr(eng); + uint64_t x = ((uint64_t)val1 << 32) | val2; + mem[i] = x; + } + + return mem; +} + +#if !defined(__clang__) && !defined(_MSC_VER) +__attribute__((optimize("no-tree-vectorize"))) +#endif +uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, size_t len) { + uint64_t total = 0; + // for (int i = 0; i < len; ++i) { + // total += STORM_popcount64(data1[i] & data2[i]); + // } + // assert(len % 8 == 0); + + for (int j = 0; j < len; j += 8) { + // total += STORM_popcount64(data[i]); + // diff = data1[i] & data2[i]; + total += STORM_popcnt_lookup8bit[data[j+0]]; + total += STORM_popcnt_lookup8bit[data[j+1]]; + total += STORM_popcnt_lookup8bit[data[j+2]]; + total += STORM_popcnt_lookup8bit[data[j+3]]; + total += STORM_popcnt_lookup8bit[data[j+4]]; + total += STORM_popcnt_lookup8bit[data[j+5]]; + total += STORM_popcnt_lookup8bit[data[j+6]]; + total += STORM_popcnt_lookup8bit[data[j+7]]; } + + return total; } #ifdef __linux__ @@ -152,8 +189,6 @@ compute_averages(std::vector< std::vector > allresults) { int linux_set_algebra_wrapper(std::string name, STORM_compute_func f, int iterations, - uint64_t* STORM_RESTRICT data1, - uint64_t* STORM_RESTRICT data2, uint32_t range, uint32_t n_values, uint32_t n_bitmaps, @@ -177,21 +212,41 @@ int linux_set_algebra_wrapper(std::string name, volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization for (uint32_t i = 0; i < iterations; i++) { - generate_random_data(data1, range, n_values); - generate_random_data(data2, range, n_values); + uint64_t* mem1 = generate_random_data(n_values); + uint64_t* mem2 = generate_random_data(n_values); unified.start(); // Call argument subroutine pointer. - total += (*f)(data1, data2, n_bitmaps); + total += (*f)(mem1, mem2, n_bitmaps); unified.end(results); allresults.push_back(results); + + STORM_aligned_free(mem1); + STORM_aligned_free(mem2); } std::vector mins = compute_mins(allresults); std::vector avg = compute_averages(allresults); - printf("%s-%u:\n",name.c_str(),n_bitmaps); if (verbose) { + printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", + name.c_str(), + n_bitmaps, + double(mins[1]) / mins[0], + double(mins[0]) / (2*n_bitmaps), + double(mins[1]) / (2*n_bitmaps), + mins[0], + mins[1], + mins[2], + mins[3], + mins[4], + avg[0], + avg[1], + avg[2], + avg[3], + avg[4]); + } else { + printf("%s-%u:\n",name.c_str(),n_bitmaps); printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " "instructions per 64-bit word %4.3f \n", double(mins[1]) / mins[0], double(mins[0]) / (2*n_bitmaps), double(mins[1]) / (2*n_bitmaps)); @@ -202,8 +257,6 @@ int linux_set_algebra_wrapper(std::string name, printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " "cache ref., %8.1f cache mis.\n", avg[0], avg[1], avg[2], avg[3], avg[4]); - } else { - printf("cycles per 64-bit word: %4.3f; ref cycles per 64-bit word: %4.3f \n", double(mins[0]) / (2*n_bitmaps), double(mins[5]) / (2*n_bitmaps)); } return 1; @@ -212,10 +265,9 @@ int linux_set_algebra_wrapper(std::string name, int linux_popcount_wrapper(std::string name, STORM_popcnt_func f, int iterations, - uint64_t* data, uint32_t range, uint32_t n_values, - uint32_t n_bitmaps, + uint32_t n_bitmaps, bool verbose) { std::vector evts; @@ -236,23 +288,46 @@ int linux_popcount_wrapper(std::string name, volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization for (uint32_t i = 0; i < iterations; i++) { - generate_random_data(data, range, n_values); + uint64_t* mem1 = generate_random_data(n_values); unified.start(); // Call argument subroutine pointer. - total += (*f)((uint8_t*)data, n_bitmaps*8); + uint64_t a = (*f)((uint8_t*)mem1, n_bitmaps*8); unified.end(results); allresults.push_back(results); + + uint64_t b = popcount_scalar_naive_nosimd((uint8_t*)mem1, n_bitmaps*8); + assert(a == b); + total += a; + + STORM_aligned_free(mem1); } std::vector mins = compute_mins(allresults); std::vector avg = compute_averages(allresults); - printf("%s-%u:\n",name.c_str(),n_bitmaps); if (verbose) { + printf("%s\t%u\t%.2f\t%.3f\t%.3f\t%llu\t%llu\t%llu\t%llu\t%llu\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\n", + name.c_str(), + n_bitmaps, + double(mins[1]) / mins[0], + double(mins[0]) / (n_bitmaps), + double(mins[1]) / (n_bitmaps), + mins[0], + mins[1], + mins[2], + mins[3], + mins[4], + avg[0], + avg[1], + avg[2], + avg[3], + avg[4]); + } else { + printf("%s-%u:\n",name.c_str(),n_bitmaps); printf("instructions per cycle %4.2f, cycles per 64-bit word: %4.3f, " "instructions per 64-bit word %4.3f \n", - double(mins[1]) / mins[0], double(mins[0]) / n_bitmaps, double(mins[1]) / n_bitmaps); + double(mins[1]) / mins[0], double(mins[0]) / (n_bitmaps), double(mins[1]) / (n_bitmaps)); // first we display mins printf("min: %8llu cycles, %8llu instructions, \t%8llu branch mis., %8llu " "cache ref., %8llu cache mis.\n", @@ -260,81 +335,12 @@ int linux_popcount_wrapper(std::string name, printf("avg: %8.1f cycles, %8.1f instructions, \t%8.1f branch mis., %8.1f " "cache ref., %8.1f cache mis.\n", avg[0], avg[1], avg[2], avg[3], avg[4]); - } else { - printf("cycles per 64-bit word: %4.3f; ref cycles per 64-bit word: %4.3f \n", double(mins[0]) / n_bitmaps, double(mins[5]) / n_bitmaps); } return 1; } #endif // end is linux -uint8_t popcnt_lookup8bit[256] = { - /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, - /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, - /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, - /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, - /* 10 */ 1, /* 11 */ 2, /* 12 */ 2, /* 13 */ 3, - /* 14 */ 2, /* 15 */ 3, /* 16 */ 3, /* 17 */ 4, - /* 18 */ 2, /* 19 */ 3, /* 1a */ 3, /* 1b */ 4, - /* 1c */ 3, /* 1d */ 4, /* 1e */ 4, /* 1f */ 5, - /* 20 */ 1, /* 21 */ 2, /* 22 */ 2, /* 23 */ 3, - /* 24 */ 2, /* 25 */ 3, /* 26 */ 3, /* 27 */ 4, - /* 28 */ 2, /* 29 */ 3, /* 2a */ 3, /* 2b */ 4, - /* 2c */ 3, /* 2d */ 4, /* 2e */ 4, /* 2f */ 5, - /* 30 */ 2, /* 31 */ 3, /* 32 */ 3, /* 33 */ 4, - /* 34 */ 3, /* 35 */ 4, /* 36 */ 4, /* 37 */ 5, - /* 38 */ 3, /* 39 */ 4, /* 3a */ 4, /* 3b */ 5, - /* 3c */ 4, /* 3d */ 5, /* 3e */ 5, /* 3f */ 6, - /* 40 */ 1, /* 41 */ 2, /* 42 */ 2, /* 43 */ 3, - /* 44 */ 2, /* 45 */ 3, /* 46 */ 3, /* 47 */ 4, - /* 48 */ 2, /* 49 */ 3, /* 4a */ 3, /* 4b */ 4, - /* 4c */ 3, /* 4d */ 4, /* 4e */ 4, /* 4f */ 5, - /* 50 */ 2, /* 51 */ 3, /* 52 */ 3, /* 53 */ 4, - /* 54 */ 3, /* 55 */ 4, /* 56 */ 4, /* 57 */ 5, - /* 58 */ 3, /* 59 */ 4, /* 5a */ 4, /* 5b */ 5, - /* 5c */ 4, /* 5d */ 5, /* 5e */ 5, /* 5f */ 6, - /* 60 */ 2, /* 61 */ 3, /* 62 */ 3, /* 63 */ 4, - /* 64 */ 3, /* 65 */ 4, /* 66 */ 4, /* 67 */ 5, - /* 68 */ 3, /* 69 */ 4, /* 6a */ 4, /* 6b */ 5, - /* 6c */ 4, /* 6d */ 5, /* 6e */ 5, /* 6f */ 6, - /* 70 */ 3, /* 71 */ 4, /* 72 */ 4, /* 73 */ 5, - /* 74 */ 4, /* 75 */ 5, /* 76 */ 5, /* 77 */ 6, - /* 78 */ 4, /* 79 */ 5, /* 7a */ 5, /* 7b */ 6, - /* 7c */ 5, /* 7d */ 6, /* 7e */ 6, /* 7f */ 7, - /* 80 */ 1, /* 81 */ 2, /* 82 */ 2, /* 83 */ 3, - /* 84 */ 2, /* 85 */ 3, /* 86 */ 3, /* 87 */ 4, - /* 88 */ 2, /* 89 */ 3, /* 8a */ 3, /* 8b */ 4, - /* 8c */ 3, /* 8d */ 4, /* 8e */ 4, /* 8f */ 5, - /* 90 */ 2, /* 91 */ 3, /* 92 */ 3, /* 93 */ 4, - /* 94 */ 3, /* 95 */ 4, /* 96 */ 4, /* 97 */ 5, - /* 98 */ 3, /* 99 */ 4, /* 9a */ 4, /* 9b */ 5, - /* 9c */ 4, /* 9d */ 5, /* 9e */ 5, /* 9f */ 6, - /* a0 */ 2, /* a1 */ 3, /* a2 */ 3, /* a3 */ 4, - /* a4 */ 3, /* a5 */ 4, /* a6 */ 4, /* a7 */ 5, - /* a8 */ 3, /* a9 */ 4, /* aa */ 4, /* ab */ 5, - /* ac */ 4, /* ad */ 5, /* ae */ 5, /* af */ 6, - /* b0 */ 3, /* b1 */ 4, /* b2 */ 4, /* b3 */ 5, - /* b4 */ 4, /* b5 */ 5, /* b6 */ 5, /* b7 */ 6, - /* b8 */ 4, /* b9 */ 5, /* ba */ 5, /* bb */ 6, - /* bc */ 5, /* bd */ 6, /* be */ 6, /* bf */ 7, - /* c0 */ 2, /* c1 */ 3, /* c2 */ 3, /* c3 */ 4, - /* c4 */ 3, /* c5 */ 4, /* c6 */ 4, /* c7 */ 5, - /* c8 */ 3, /* c9 */ 4, /* ca */ 4, /* cb */ 5, - /* cc */ 4, /* cd */ 5, /* ce */ 5, /* cf */ 6, - /* d0 */ 3, /* d1 */ 4, /* d2 */ 4, /* d3 */ 5, - /* d4 */ 4, /* d5 */ 5, /* d6 */ 5, /* d7 */ 6, - /* d8 */ 4, /* d9 */ 5, /* da */ 5, /* db */ 6, - /* dc */ 5, /* dd */ 6, /* de */ 6, /* df */ 7, - /* e0 */ 3, /* e1 */ 4, /* e2 */ 4, /* e3 */ 5, - /* e4 */ 4, /* e5 */ 5, /* e6 */ 5, /* e7 */ 6, - /* e8 */ 4, /* e9 */ 5, /* ea */ 5, /* eb */ 6, - /* ec */ 5, /* ed */ 6, /* ee */ 6, /* ef */ 7, - /* f0 */ 4, /* f1 */ 5, /* f2 */ 5, /* f3 */ 6, - /* f4 */ 5, /* f5 */ 6, /* f6 */ 6, /* f7 */ 7, - /* f8 */ 5, /* f9 */ 6, /* fa */ 6, /* fb */ 7, - /* fc */ 6, /* fd */ 7, /* fe */ 7, /* ff */ 8 -}; - struct bench_unit { bench_unit() : valid(false), cycles(0), cycles_local(0), times(0), times_local(0){} @@ -359,33 +365,7 @@ uint64_t get_cpu_cycles() { #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t popcount_scalar_naive_nosimd(const uint8_t* data, uint32_t len) { - uint64_t total = 0; - // for (int i = 0; i < len; ++i) { - // total += STORM_popcount64(data1[i] & data2[i]); - // } - // assert(len % 8 == 0); - - for (int j = 0; j < len; j += 8) { - // total += STORM_popcount64(data[i]); - // diff = data1[i] & data2[i]; - total += popcnt_lookup8bit[data[j+0]]; - total += popcnt_lookup8bit[data[j+1]]; - total += popcnt_lookup8bit[data[j+2]]; - total += popcnt_lookup8bit[data[j+3]]; - total += popcnt_lookup8bit[data[j+4]]; - total += popcnt_lookup8bit[data[j+5]]; - total += popcnt_lookup8bit[data[j+6]]; - total += popcnt_lookup8bit[data[j+7]]; - } - - return total; -} - -#if !defined(__clang__) && !defined(_MSC_VER) -__attribute__((optimize("no-tree-vectorize"))) -#endif -uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, uint32_t len) { +uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] & data2[i]); @@ -396,14 +376,14 @@ uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,cons for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] & data2[i]); diff = data1[i] & data2[i]; - total += popcnt_lookup8bit[b8[0]]; - total += popcnt_lookup8bit[b8[1]]; - total += popcnt_lookup8bit[b8[2]]; - total += popcnt_lookup8bit[b8[3]]; - total += popcnt_lookup8bit[b8[4]]; - total += popcnt_lookup8bit[b8[5]]; - total += popcnt_lookup8bit[b8[6]]; - total += popcnt_lookup8bit[b8[7]]; + total += STORM_popcnt_lookup8bit[b8[0]]; + total += STORM_popcnt_lookup8bit[b8[1]]; + total += STORM_popcnt_lookup8bit[b8[2]]; + total += STORM_popcnt_lookup8bit[b8[3]]; + total += STORM_popcnt_lookup8bit[b8[4]]; + total += STORM_popcnt_lookup8bit[b8[5]]; + total += STORM_popcnt_lookup8bit[b8[6]]; + total += STORM_popcnt_lookup8bit[b8[7]]; } return total; @@ -412,7 +392,7 @@ uint64_t intersect_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,cons #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, uint32_t len) { +uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] | data2[i]); @@ -423,14 +403,14 @@ uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const ui for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] | data2[i]); diff = data1[i] | data2[i]; - total += popcnt_lookup8bit[b8[0]]; - total += popcnt_lookup8bit[b8[1]]; - total += popcnt_lookup8bit[b8[2]]; - total += popcnt_lookup8bit[b8[3]]; - total += popcnt_lookup8bit[b8[4]]; - total += popcnt_lookup8bit[b8[5]]; - total += popcnt_lookup8bit[b8[6]]; - total += popcnt_lookup8bit[b8[7]]; + total += STORM_popcnt_lookup8bit[b8[0]]; + total += STORM_popcnt_lookup8bit[b8[1]]; + total += STORM_popcnt_lookup8bit[b8[2]]; + total += STORM_popcnt_lookup8bit[b8[3]]; + total += STORM_popcnt_lookup8bit[b8[4]]; + total += STORM_popcnt_lookup8bit[b8[5]]; + total += STORM_popcnt_lookup8bit[b8[6]]; + total += STORM_popcnt_lookup8bit[b8[7]]; } return total; @@ -439,7 +419,7 @@ uint64_t union_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const ui #if !defined(__clang__) && !defined(_MSC_VER) __attribute__((optimize("no-tree-vectorize"))) #endif -uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, uint32_t len) { +uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uint64_t* STORM_RESTRICT data2, size_t len) { uint64_t total = 0; // for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] ^ data2[i]); @@ -450,14 +430,14 @@ uint64_t diff_scalar_naive_nosimd(const uint64_t* STORM_RESTRICT data1,const uin for (int i = 0; i < len; ++i) { // total += STORM_popcount64(data1[i] ^ data2[i]); diff = data1[i] ^ data2[i]; - total += popcnt_lookup8bit[b8[0]]; - total += popcnt_lookup8bit[b8[1]]; - total += popcnt_lookup8bit[b8[2]]; - total += popcnt_lookup8bit[b8[3]]; - total += popcnt_lookup8bit[b8[4]]; - total += popcnt_lookup8bit[b8[5]]; - total += popcnt_lookup8bit[b8[6]]; - total += popcnt_lookup8bit[b8[7]]; + total += STORM_popcnt_lookup8bit[b8[0]]; + total += STORM_popcnt_lookup8bit[b8[1]]; + total += STORM_popcnt_lookup8bit[b8[2]]; + total += STORM_popcnt_lookup8bit[b8[3]]; + total += STORM_popcnt_lookup8bit[b8[4]]; + total += STORM_popcnt_lookup8bit[b8[5]]; + total += STORM_popcnt_lookup8bit[b8[6]]; + total += STORM_popcnt_lookup8bit[b8[7]]; } return total; @@ -469,11 +449,9 @@ typedef std::chrono::high_resolution_clock::time_point clockdef; int set_algebra_wrapper(std::string name, STORM_compute_func f, int iterations, - uint64_t* STORM_RESTRICT data1, - uint64_t* STORM_RESTRICT data2, uint32_t range, uint32_t n_values, - uint32_t n_bitmaps, + size_t n_bitmaps, bench_unit& unit) { uint32_t cycles_low = 0, cycles_high = 0; @@ -503,8 +481,8 @@ asm volatile("RDTSCP\n\t" "mov %%eax, %1\n\t" "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); #endif - generate_random_data(data1, range, n_values); - generate_random_data(data2, range, n_values); + uint64_t* mem1 = generate_random_data(n_values); + uint64_t* mem2 = generate_random_data(n_values); volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization clockdef t1 = std::chrono::high_resolution_clock::now(); @@ -525,7 +503,7 @@ asm volatile("RDTSCP\n\t" for (int i = 0; i < iterations; ++i) { // Call argument subroutine pointer. - total += (*f)(data1, data2, n_bitmaps); + total += (*f)(mem1, mem2, n_bitmaps); } #ifndef _MSC_VER @@ -542,6 +520,9 @@ asm volatile("RDTSCP\n\t" clockdef t2 = std::chrono::high_resolution_clock::now(); auto time_span = std::chrono::duration_cast(t2 - t1); + STORM_aligned_free(mem1); + STORM_aligned_free(mem2); + uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); @@ -566,7 +547,6 @@ asm volatile("RDTSCP\n\t" int popcount_wrapper(std::string name, STORM_popcnt_func f, int iterations, - uint64_t* data, uint32_t range, uint32_t n_values, uint32_t n_bitmaps, @@ -599,7 +579,7 @@ asm volatile("RDTSCP\n\t" "mov %%eax, %1\n\t" "CPUID\n\t": "=r" (cycles_high1), "=r" (cycles_low1):: "%rax", "%rbx", "%rcx", "%rdx"); #endif - generate_random_data(data, range, n_values); + uint64_t* mem = generate_random_data(n_values); volatile uint64_t total = 0; // voltatile to prevent compiler to remove work through optimization clockdef t1 = std::chrono::high_resolution_clock::now(); @@ -618,9 +598,10 @@ asm volatile("RDTSCP\n\t" "mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax", "%rbx", "%rcx", "%rdx"); #endif + size_t n_b = n_bitmaps*8; for (int i = 0; i < iterations; ++i) { // Call argument subroutine pointer. - total += (*f)((uint8_t*)data, n_bitmaps); + total += (*f)((uint8_t*)mem, n_b); } #ifndef _MSC_VER @@ -637,6 +618,8 @@ asm volatile("RDTSCP\n\t" clockdef t2 = std::chrono::high_resolution_clock::now(); auto time_span = std::chrono::duration_cast(t2 - t1); + STORM_aligned_free(mem); + uint64_t start = ( ((uint64_t)cycles_high << 32) | cycles_low ); uint64_t end = ( ((uint64_t)cycles_high1 << 32) | cycles_low1 ); @@ -658,92 +641,95 @@ asm volatile("RDTSCP\n\t" return 0; } -int benchmark(uint32_t range, uint32_t n_values) { - std::cout << "Range = [0," << range << ") with bits=" << n_values << std::endl; - +int benchmark(int n_repetitions, bool use_perf = false) { // Align some bitmaps. - uint32_t n_bitmaps = ceil(range / 64.0); - uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); - uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), n_bitmaps*sizeof(uint64_t)); - - generate_random_data(bitmaps, range, n_values); - generate_random_data(bitmaps2, range, n_values); + uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); + uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); - // Compute the population bit count. - uint64_t bitcnt = STORM_popcnt((uint8_t*)bitmaps, n_bitmaps*8); - std::cout << "POPCNT=" << bitcnt << std::endl; - - // Compute the positional bit count. - uint32_t flag_count[16]; - int pospopcnt = STORM_pospopcnt_u16((uint16_t*)bitmaps, n_bitmaps*4, &flag_count[0]); - uint32_t pospopcnt_total = flag_count[0]; - std::cout << "POSPOPCNT=" << flag_count[0]; - for (int i = 1; i < 16; ++i) { - std::cout << "," << flag_count[i]; - pospopcnt_total += flag_count[i]; + std::vector ranges = {4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,65536,131072,262144,524288,1048576}; + std::vector reps; + if (n_repetitions <= 0) { + reps = {5000,5000,5000,5000,5000,2500,2500,2500,2500,2500,150,150,150,150,150,150,150,100,100,100}; + } else { + reps = std::vector(ranges.size(), n_repetitions); } - std::cout << std::endl; - std::cout << "POSPOPCNT total=" << pospopcnt_total << std::endl; - - // Compute intersect count - uint64_t intersect_count = STORM_intersect_count(bitmaps, bitmaps2, n_bitmaps); - std::cout << "intersect count=" << intersect_count << std::endl; - - // Compute union count - uint64_t union_count = STORM_union_count(bitmaps, bitmaps2, n_bitmaps); - std::cout << "union count=" << union_count << std::endl; - - // Compute diff count - uint64_t diff_count = STORM_diff_count(bitmaps, bitmaps2, n_bitmaps); - std::cout << "diff count=" << diff_count << std::endl; - - { - // Align some bitmaps. - uint32_t n_bitmaps = ceil(range / 64.0); - uint64_t* bitmaps = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); - uint64_t* bitmaps2 = (uint64_t*)STORM_aligned_malloc(STORM_get_alignment(), 1048576*sizeof(uint64_t)); - - std::vector ranges = {4,8,32,128,256,512,1024,2048,4096,8192,65536,262144,1048576}; - std::vector iterations = {2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000}; - assert(ranges.size() == iterations.size()); - - for (int i = 0; i < ranges.size(); ++i) { - bench_unit unit_intsec, unit_union, unit_diff; - + + if (use_perf) { +#ifndef __linux__ + std::cerr << "perf counter are only available on Linux systems!" << std::endl; + exit(EXIT_FAILURE); +#endif + printf("Algorithm\tWords\tInstructions/cycle\tCycles/word\tInstructions/word\tMinCycles\tMinInstructions\tMinBranchMiss\tMinCacheRef\tminCacheMiss\tAvgCycles\tAvgInstructions\tAvgBranchMiss\tAvgCacheRef\tAvgCacheMiss\n"); + } + + + for (int i = 0; i < ranges.size(); ++i) { + bench_unit unit_intsec, unit_union, unit_diff; + + if (use_perf) { #ifdef __linux__ - linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, iterations[i], bitmaps, range, n_values, ranges[i], true); - linux_popcount_wrapper("popcount",&STORM_popcnt, iterations[i], bitmaps, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); - linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], true); + linux_popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], true); + linux_set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], true); #else - popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, iterations[i], bitmaps, range, n_values, ranges[i], unit_intsec); - popcount_wrapper("popcount",&STORM_popcnt, iterations[i], bitmaps, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_union); - set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_intsec); - set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), iterations[i], bitmaps, bitmaps2, range, n_values, ranges[i], unit_diff); + std::cerr << "perf counter are only available on Linux systems!" << std::endl; + exit(EXIT_FAILURE); #endif + } else { + popcount_wrapper("popcount-naive",&popcount_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + popcount_wrapper("popcount",&STORM_popcnt, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("intersect-naive",&intersect_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("intersect",STORM_get_intersect_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("union-naive",&union_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("union",STORM_get_union_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_union); + set_algebra_wrapper("diff-naive",&diff_scalar_naive_nosimd, reps[i], ranges[i], ranges[i], ranges[i], unit_intsec); + set_algebra_wrapper("diff",STORM_get_diff_count_func(ranges[i]), reps[i], ranges[i], ranges[i], ranges[i], unit_diff); } - - // Clean up. - STORM_aligned_free(bitmaps); - STORM_aligned_free(bitmaps2); } // Clean up. STORM_aligned_free(bitmaps); STORM_aligned_free(bitmaps2); + return 1; } -int main(int argc, char **argv) { - benchmark(2048000, 50000); +int main(int argc, char **argv) { +#if !defined(_MSC_VER) + bool verbose = false; + bool perf_subsystem = false; + int c; + int n_repetitions = -1; + + while ((c = getopt(argc, argv, "vpr:")) != -1) { + switch (c) { + case 'r': + n_repetitions = atoi(optarg); + break; + case 'v': + verbose = true; + break; + case 'p': + perf_subsystem = true; + break; + default: + abort(); + } + } + + benchmark(n_repetitions, perf_subsystem); +#else + int n_repetitions = -1; + if (argc > 2) { + n_repetitions = std::atoi(argv[1]); + } + benchmark(n_repetitions, false); +#endif return EXIT_SUCCESS; } \ No newline at end of file diff --git a/libalgebra.h b/libalgebra.h index 2a92c7a..ac0a790 100644 --- a/libalgebra.h +++ b/libalgebra.h @@ -65,8 +65,8 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef LIBALGEBRA_H_9827563662203 -#define LIBALGEBRA_H_9827563662203 +#ifndef LIBALGEBRA_H_8723467365934 +#define LIBALGEBRA_H_8723467365934 /* ************************************* * Includes @@ -77,32 +77,18 @@ #include #include -// Safety +/* ************************************* +* Safety +***************************************/ + #if !(defined(__APPLE__)) && !(defined(__FreeBSD__)) #include // this should never be needed but there are some reports that it is needed. #endif -/* ************************************* - * Support. - * - * These subroutines and definitions are taken from the CRoaring repo - * by Daniel Lemire et al. available under the Apache 2.0 License - * (same as libintersect.h): - * https://github.com/RoaringBitmap/CRoaring/ - ***************************************/ #if defined(__SIZEOF_LONG_LONG__) && __SIZEOF_LONG_LONG__ != 8 #error This code assumes 64-bit long longs (by use of the GCC intrinsics). Your system is not currently supported. #endif -/* === Compiler specifics === */ - -#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ -# define STORM_RESTRICT restrict -#else -/* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */ -# define STORM_RESTRICT /* disable */ -#endif - /**************************** * Memory management * @@ -110,6 +96,11 @@ * STORM_aligned_malloc and STORM_aligned_free to prevent clashing with the * same subroutines in Roaring. These subroutines are included here * since there is no hard dependency on using Roaring bitmaps. +* +* These subroutines and definitions are taken from the CRoaring repo +* by Daniel Lemire et al. available under the Apache 2.0 License +* (same as libalgebra.h): +* https://github.com/RoaringBitmap/CRoaring/ ****************************/ // portable version of posix_memalign #ifndef _MSC_VER @@ -150,15 +141,18 @@ void STORM_aligned_free(void* memblock) { // portable alignment #if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* C11+ */ # include -# define STORM_ALIGN(n) alignas(n) +# define STORM_ALIGN(n) alignas(n) #elif defined(__GNUC__) -# define STORM_ALIGN(n) __attribute__ ((aligned(n))) +# define STORM_ALIGN(n) __attribute__ ((aligned(n))) #elif defined(_MSC_VER) -# define STORM_ALIGN(n) __declspec(align(n)) +# define STORM_ALIGN(n) __declspec(align(n)) #else -# define STORM_ALIGN(n) /* disabled */ +# define STORM_ALIGN(n) /* disabled */ #endif +/* ************************************* +* Compiler Specific Options +***************************************/ // Taken from XXHASH #ifdef _MSC_VER /* Visual Studio */ # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ @@ -179,29 +173,34 @@ void STORM_aligned_free(void* memblock) { # endif /* __STDC_VERSION__ */ #endif -// disable noise -#ifdef __GNUC__ -#define WARN_UNUSED __attribute__((warn_unused_result)) -#else -#define WARN_UNUSED -#endif - -/*------ SIMD definitions --------*/ - -#define STORM_SSE_ALIGNMENT 16 -#define STORM_AVX2_ALIGNMENT 32 -#define STORM_AVX512_ALIGNMENT 64 - /**************************** * General checks ****************************/ #ifndef __has_builtin - #define __has_builtin(x) 0 + #define STORM_HAS_BUILTIN(x) 0 +#else + #define STORM_HAS_BUILTIN(x) __has_builtin(x) #endif #ifndef __has_attribute - #define __has_attribute(x) 0 + #define STORM_HAS_ATTRIBUTE(x) 0 +#else + #define STORM_HAS_ATTRIBUTE(x) __has_attribute(x) +#endif + +// disable noise +#ifdef __GNUC__ +#define STORM_WARN_UNUSED __attribute__((warn_unused_result)) +#else +#define STORM_WARN_UNUSED +#endif + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define STORM_RESTRICT restrict +#else +/* note : it might be useful to define __restrict or STORM_RESTRICT for some C++ compilers */ +# define STORM_RESTRICT /* disable */ #endif #ifdef __GNUC__ @@ -229,68 +228,81 @@ void STORM_aligned_free(void* memblock) { (defined(__cplusplus) || \ defined(_MSC_VER) || \ (GNUC_PREREQ(4, 2) || \ - __has_builtin(__sync_val_compare_and_swap))) - #define HAVE_CPUID + STORM_HAS_BUILTIN(__sync_val_compare_and_swap))) + #define STORM_HAVE_CPUID #endif #if GNUC_PREREQ(4, 2) || \ - __has_builtin(__builtin_popcount) - #define HAVE_BUILTIN_POPCOUNT + STORM_HAS_BUILTIN(__builtin_popcount) + #define STORM_HAVE_BUILTIN_POPCOUNT #endif #if GNUC_PREREQ(4, 2) || \ CLANG_PREREQ(3, 0) - #define HAVE_ASM_POPCNT + #define STORM_HAVE_ASM_POPCNT #endif -#if defined(HAVE_CPUID) && \ - (defined(HAVE_ASM_POPCNT) || \ +#if defined(STORM_HAVE_CPUID) && \ + (defined(STORM_HAVE_ASM_POPCNT) || \ defined(_MSC_VER)) - #define HAVE_POPCNT + #define STORM_HAVE_POPCNT #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ GNUC_PREREQ(4, 9) - #define HAVE_SSE42 - #define HAVE_AVX2 + #define STORM_HAVE_SSE42 + #define STORM_HAVE_AVX2 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ GNUC_PREREQ(5, 0) - #define HAVE_AVX512 + #define STORM_HAVE_AVX512 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ defined(_MSC_VER) && \ defined(__AVX2__) - #define HAVE_SSE42 - #define HAVE_AVX2 + #define STORM_HAVE_SSE42 + #define STORM_HAVE_AVX2 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ defined(_MSC_VER) && \ defined(__AVX512__) - #define HAVE_AVX512 + #define STORM_HAVE_AVX512 #endif -#if defined(HAVE_CPUID) && \ +#if defined(STORM_HAVE_CPUID) && \ CLANG_PREREQ(3, 8) && \ - __has_attribute(target) && \ + STORM_HAS_ATTRIBUTE(target) && \ (!defined(_MSC_VER) || defined(__AVX2__)) && \ (!defined(__apple_build_version__) || __apple_build_version__ >= 8000000) - #define HAVE_SSE42 - #define HAVE_AVX2 - #define HAVE_AVX512 + #define STORM_HAVE_SSE42 + #define STORM_HAVE_AVX2 + #define STORM_HAVE_AVX512 #endif -#ifdef __cplusplus -extern "C" { +// Target attribute +#if !defined(_MSC_VER) + #define STORM_TARGET(x) __attribute__ ((target (x))) +#else + #define STORM_TARGET(x) 0 #endif + /**************************** -* CPUID +* CPUID and SIMD ****************************/ -#if defined(HAVE_CPUID) + +#define STORM_SSE_ALIGNMENT 16 +#define STORM_AVX2_ALIGNMENT 32 +#define STORM_AVX512_ALIGNMENT 64 + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(STORM_HAVE_CPUID) #if defined(_MSC_VER) #include @@ -299,13 +311,13 @@ extern "C" { // CPUID flags. See https://en.wikipedia.org/wiki/CPUID for more info. /* %ecx bit flags */ -#define STORM_bit_POPCNT (1 << 23) // POPCNT instruction -#define STORM_bit_SSE41 (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19] -#define STORM_bit_SSE42 (1 << 20) // CPUID.01H:ECX.SSE41[Bit 20] +#define STORM_CPUID_runtime_bit_POPCNT (1 << 23) // POPCNT instruction +#define STORM_CPUID_runtime_bit_SSE41 (1 << 19) // CPUID.01H:ECX.SSE41[Bit 19] +#define STORM_CPUID_runtime_bit_SSE42 (1 << 20) // CPUID.01H:ECX.SSE41[Bit 20] /* %ebx bit flags */ -#define STORM_bit_AVX2 (1 << 5) // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5] -#define STORM_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions +#define STORM_CPUID_runtime_bit_AVX2 (1 << 5) // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5] +#define STORM_CPUID_runtime_bit_AVX512BW (1 << 30) // AVX-512 Byte and Word Instructions /* xgetbv bit flags */ #define STORM_XSTATE_SSE (1 << 1) @@ -345,11 +357,11 @@ void STORM_run_cpuid(int eax, int ecx, int* abcd) { #endif } -#if defined(HAVE_AVX2) || \ - defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX2) || \ + defined(STORM_HAVE_AVX512) static -int get_xcr0() { +int STORM_get_xcr0() { int xcr0; #if defined(_MSC_VER) @@ -364,26 +376,26 @@ int get_xcr0() { #endif static -int get_cpuid() { +int STORM_get_cpuid() { int flags = 0; int abcd[4]; STORM_run_cpuid(1, 0, abcd); // Check for POPCNT instruction - if ((abcd[2] & STORM_bit_POPCNT) == STORM_bit_POPCNT) - flags |= STORM_bit_POPCNT; + if ((abcd[2] & STORM_CPUID_runtime_bit_POPCNT) == STORM_CPUID_runtime_bit_POPCNT) + flags |= STORM_CPUID_runtime_bit_POPCNT; // Check for SSE4.1 instruction set - if ((abcd[2] & STORM_bit_SSE41) == STORM_bit_SSE41) - flags |= STORM_bit_SSE41; + if ((abcd[2] & STORM_CPUID_runtime_bit_SSE41) == STORM_CPUID_runtime_bit_SSE41) + flags |= STORM_CPUID_runtime_bit_SSE41; // Check for SSE4.2 instruction set - if ((abcd[2] & STORM_bit_SSE42) == STORM_bit_SSE42) - flags |= STORM_bit_SSE42; + if ((abcd[2] & STORM_CPUID_runtime_bit_SSE42) == STORM_CPUID_runtime_bit_SSE42) + flags |= STORM_CPUID_runtime_bit_SSE42; -#if defined(HAVE_AVX2) || \ - defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX2) || \ + defined(STORM_HAVE_AVX512) int osxsave_mask = (1 << 27); @@ -394,17 +406,17 @@ int get_cpuid() { int ymm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM; int zmm_mask = STORM_XSTATE_SSE | STORM_XSTATE_YMM | STORM_XSTATE_ZMM; - int xcr0 = get_xcr0(); + int xcr0 = STORM_get_xcr0(); if ((xcr0 & ymm_mask) == ymm_mask) { STORM_run_cpuid(7, 0, abcd); - if ((abcd[1] & STORM_bit_AVX2) == STORM_bit_AVX2) - flags |= STORM_bit_AVX2; + if ((abcd[1] & STORM_CPUID_runtime_bit_AVX2) == STORM_CPUID_runtime_bit_AVX2) + flags |= STORM_CPUID_runtime_bit_AVX2; if ((xcr0 & zmm_mask) == zmm_mask) { - if ((abcd[1] & STORM_bit_AVX512BW) == STORM_bit_AVX512BW) - flags |= STORM_bit_AVX512BW; + if ((abcd[1] & STORM_CPUID_runtime_bit_AVX512BW) == STORM_CPUID_runtime_bit_AVX512BW) + flags |= STORM_CPUID_runtime_bit_AVX512BW; } } @@ -412,10 +424,10 @@ int get_cpuid() { return flags; } -#endif // defined(HAVE_CPUID) +#endif // defined(STORM_HAVE_CPUID) /// Taken from libpopcnt.h -#if defined(HAVE_ASM_POPCNT) && \ +#if defined(STORM_HAVE_ASM_POPCNT) && \ defined(__x86_64__) STORM_FORCE_INLINE @@ -425,7 +437,7 @@ uint64_t STORM_POPCOUNT(uint64_t x) return x; } -#elif defined(HAVE_ASM_POPCNT) && \ +#elif defined(STORM_HAVE_ASM_POPCNT) && \ defined(__i386__) STORM_FORCE_INLINE @@ -465,7 +477,7 @@ uint64_t STORM_POPCOUNT(uint64_t x) } /* non x86 CPUs */ -#elif defined(HAVE_BUILTIN_POPCOUNT) +#elif defined(STORM_HAVE_BUILTIN_POPCOUNT) STORM_FORCE_INLINE uint64_t STORM_POPCOUNT(uint64_t x) { @@ -476,7 +488,6 @@ uint64_t STORM_POPCOUNT(uint64_t x) { * use pure integer algorithm */ #else -// TODO: FIXME STORM_FORCE_INLINE uint64_t STORM_POPCOUNT(uint64_t x) { return STORM_popcount64(x); @@ -488,7 +499,7 @@ uint64_t STORM_POPCOUNT(uint64_t x) { static uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, - uint64_t size) + size_t size) { const uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -510,7 +521,7 @@ uint64_t STORM_intersect_count_unrolled(const uint64_t* STORM_RESTRICT data1, static uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, - uint64_t size) + size_t size) { const uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -532,7 +543,7 @@ uint64_t STORM_union_count_unrolled(const uint64_t* STORM_RESTRICT data1, static uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, - uint64_t size) + size_t size) { const uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -552,10 +563,10 @@ uint64_t STORM_diff_count_unrolled(const uint64_t* STORM_RESTRICT data1, } static -int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_scalar_naive(const uint16_t* data, size_t len, uint32_t* out) { for (int i = 0; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((data[i] & (1 << j)) >> j); + out[j] += ((data[i] & (1 << j)) >> j); } } @@ -581,7 +592,7 @@ uint64_t STORM_pospopcnt_loadu_u64(const void* ptr) { // By @aqrit (https://github.com/aqrit) // @see: https://gist.github.com/aqrit/c729815b0165c139d0bac642ab7ee104 static -int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, uint32_t n, uint32_t* out) { +int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, size_t n, uint32_t* out) { while (n >= 8) { uint64_t counter_a = 0; // 4 packed 12-bit counters uint64_t counter_b = 0; @@ -683,10 +694,10 @@ int STORM_pospopcnt_u16_scalar_umul128_unroll2(const uint16_t* in, uint32_t n, u STORM_FORCE_INLINE uint64_t STORM_popcount64(uint64_t x) { - uint64_t m1 = 0x5555555555555555ll; - uint64_t m2 = 0x3333333333333333ll; - uint64_t m4 = 0x0F0F0F0F0F0F0F0Fll; - uint64_t h01 = 0x0101010101010101ll; + uint64_t m1 = UINT64_C(0x5555555555555555); + uint64_t m2 = UINT64_C(0x3333333333333333); + uint64_t m4 = UINT64_C(0x0F0F0F0F0F0F0F0F); + uint64_t h01 = UINT64_C(0x0101010101010101); x -= (x >> 1) & m1; x = (x & m2) + ((x >> 2) & m2); @@ -768,29 +779,18 @@ const uint8_t STORM_popcnt_lookup8bit[256] = { * SSE4.1 functions ****************************/ -#if defined(HAVE_SSE42) +#if defined(STORM_HAVE_SSE42) #include -#ifndef STORM_POPCOUNT_SSE4 -#define STORM_POPCOUNT_SSE4(A, B) { \ - A += STORM_POPCOUNT(_mm_extract_epi64(B, 0)); \ - A += STORM_POPCOUNT(_mm_extract_epi64(B, 1)); \ -} -#endif - -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") STORM_FORCE_INLINE uint64_t STORM_POPCOUNT_SSE(const __m128i n) { return(STORM_POPCOUNT(_mm_cvtsi128_si64(n)) + STORM_POPCOUNT(_mm_cvtsi128_si64(_mm_unpackhi_epi64(n, n)))); } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") STORM_FORCE_INLINE void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) { __m128i u = _mm_xor_si128(a, b); @@ -817,9 +817,7 @@ void STORM_CSA128(__m128i* h, __m128i* l, __m128i a, __m128i b, __m128i c) { * @param b * @param c */ -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") STORM_FORCE_INLINE void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h, __m128i* STORM_RESTRICT l, @@ -833,11 +831,9 @@ void STORM_pospopcnt_csa_sse(__m128i* STORM_RESTRICT h, // By @aqrit (https://github.com/aqrit) // @see: https://gist.github.com/aqrit/cb52b2ac5b7d0dfe9319c09d27237bf3 -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static -int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, uint32_t len, uint32_t* flag_counts) { +int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, size_t len, uint32_t* flag_counts) { const __m128i zero = _mm_setzero_si128(); const __m128i mask_lo_byte = _mm_srli_epi16(_mm_cmpeq_epi8(zero, zero), 8); const __m128i mask_lo_cnt = _mm_srli_epi16(mask_lo_byte, 2); @@ -1032,11 +1028,9 @@ int STORM_pospopcnt_u16_sse_sad(const uint16_t* data, uint32_t len, uint32_t* fl return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static -int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) { const __m128i* data_vectors = (const __m128i*)(array); const uint32_t n_cycles = len / 8; @@ -1053,8 +1047,8 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t U(0,1) U(2,3) U(4,5) U(6,7) for (int i = 0; i < 8; ++i) { -#define A0(p) flags[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p)); -#define A1(k) flags[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k)); +#define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##p)); +#define A1(k) out[15 - i] += _mm_popcnt_u32(_mm_movemask_epi8(input##k)); #define A(p, k) A0(p) A1(k) A(0,1) A(2, 3) A(4,5) A(6, 7) @@ -1088,7 +1082,7 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t i *= 8; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1104,14 +1098,12 @@ int STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(const uint16_t* array, uint32_t return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static -int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (16 * 8)); i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1171,7 +1163,7 @@ int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uin for (size_t i = 0; i < 16; ++i) { _mm_storeu_si128((__m128i*)buffer, counter[i]); for (size_t z = 0; z < 8; z++) { - flags[i] += 16 * (uint32_t)buffer[z]; + out[i] += 16 * (uint32_t)buffer[z]; } } } @@ -1179,38 +1171,36 @@ int STORM_pospopcnt_u16_sse_harvey_seal(const uint16_t* array, uint32_t len, uin _mm_storeu_si128((__m128i*)buffer, v1); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((buffer[i] & (1 << j)) >> j); + out[j] += ((buffer[i] & (1 << j)) >> j); } } _mm_storeu_si128((__m128i*)buffer, v2); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 2 * ((buffer[i] & (1 << j)) >> j); + out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm_storeu_si128((__m128i*)buffer, v4); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 4 * ((buffer[i] & (1 << j)) >> j); + out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm_storeu_si128((__m128i*)buffer, v8); for (size_t i = 0; i < 8; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 8 * ((buffer[i] & (1 << j)) >> j); + out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m128i ones = _mm_setzero_si128(); __m128i twos = _mm_setzero_si128(); @@ -1258,13 +1248,11 @@ uint64_t STORM_intersect_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m128i ones = _mm_setzero_si128(); __m128i twos = _mm_setzero_si128(); @@ -1312,13 +1300,11 @@ uint64_t STORM_union_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1, const __m128i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m128i ones = _mm_setzero_si128(); __m128i twos = _mm_setzero_si128(); @@ -1366,13 +1352,62 @@ uint64_t STORM_diff_count_csa_sse4(const __m128i* STORM_RESTRICT data1, return cnt64; } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") +static +uint64_t STORM_popcnt_csa_sse4(const __m128i* STORM_RESTRICT data, + size_t size) +{ + __m128i ones = _mm_setzero_si128(); + __m128i twos = _mm_setzero_si128(); + __m128i fours = _mm_setzero_si128(); + __m128i eights = _mm_setzero_si128(); + __m128i sixteens = _mm_setzero_si128(); + __m128i twosA, twosB, foursA, foursB, eightsA, eightsB; + + uint64_t i = 0; + uint64_t limit = size - size % 16; + uint64_t cnt64 = 0; + +#define LOAD(a) (_mm_loadu_si128(&data[i+a])) + + for (/**/; i < limit; i += 16) { + STORM_CSA128(&twosA, &ones, ones, LOAD(0), LOAD(1)); + STORM_CSA128(&twosB, &ones, ones, LOAD(2), LOAD(3)); + STORM_CSA128(&foursA, &twos, twos, twosA, twosB); + STORM_CSA128(&twosA, &ones, ones, LOAD(4), LOAD(5)); + STORM_CSA128(&twosB, &ones, ones, LOAD(6), LOAD(7)); + STORM_CSA128(&foursB, &twos, twos, twosA, twosB); + STORM_CSA128(&eightsA, &fours, fours, foursA, foursB); + STORM_CSA128(&twosA, &ones, ones, LOAD(8), LOAD(9)); + STORM_CSA128(&twosB, &ones, ones, LOAD(10), LOAD(11)); + STORM_CSA128(&foursA, &twos, twos, twosA, twosB); + STORM_CSA128(&twosA, &ones, ones, LOAD(12), LOAD(13)); + STORM_CSA128(&twosB, &ones, ones, LOAD(14), LOAD(15)); + STORM_CSA128(&foursB, &twos, twos, twosA, twosB); + STORM_CSA128(&eightsB, &fours, fours, foursA, foursB); + STORM_CSA128(&sixteens,&eights, eights,eightsA,eightsB); + + cnt64 += STORM_POPCOUNT_SSE(sixteens); + } +#undef LOAD + + cnt64 <<= 4; + cnt64 += STORM_POPCOUNT_SSE(eights) << 3; + cnt64 += STORM_POPCOUNT_SSE(fours) << 2; + cnt64 += STORM_POPCOUNT_SSE(twos) << 1; + cnt64 += STORM_POPCOUNT_SSE(ones) << 0; + + for (/**/; i < size; ++i) + cnt64 = STORM_POPCOUNT_SSE(_mm_loadu_si128(&data[i])); + + return cnt64; +} + +STORM_TARGET("sse4.2") static uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m128i* r1 = (__m128i*)b1; @@ -1388,13 +1423,11 @@ uint64_t STORM_intersect_count_sse4(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m128i* r1 = (__m128i*)b1; @@ -1410,13 +1443,11 @@ uint64_t STORM_union_count_sse4(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("sse4.2"))) -#endif +STORM_TARGET("sse4.2") static uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m128i* r1 = (__m128i*)b1; @@ -1431,28 +1462,35 @@ uint64_t STORM_diff_count_sse4(const uint64_t* STORM_RESTRICT b1, return(count); } + +STORM_TARGET("sse4.2") +static +uint64_t STORM_popcnt_sse4(const uint64_t* STORM_RESTRICT data, + const size_t n_ints) +{ + uint64_t count = 0; + const __m128i* r1 = (__m128i*)data; + const uint32_t n_cycles = n_ints / 2; + + count += STORM_popcnt_csa_sse4(r1, n_cycles); + + for (int i = n_cycles*2; i < n_ints; ++i) { + count += STORM_POPCOUNT(data[i]); + } + + return(count); +} #endif /**************************** * AVX256 functions ****************************/ -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) #include -#ifndef STORM_POPCOUNT_AVX2 -#define STORM_POPCOUNT_AVX2(A, B) { \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 0)); \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 1)); \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 2)); \ - A += STORM_POPCOUNT(_mm256_extract_epi64(B, 3)); \ -} -#endif - -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") STORM_FORCE_INLINE void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) { __m256i u = _mm256_xor_si256(a, b); @@ -1460,9 +1498,7 @@ void STORM_CSA256(__m256i* h, __m256i* l, __m256i a, __m256i b, __m256i c) { *l = _mm256_xor_si256(u, c); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") STORM_FORCE_INLINE void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h, __m256i* STORM_RESTRICT l, @@ -1474,11 +1510,9 @@ void STORM_pospopcnt_csa_avx2(__m256i* STORM_RESTRICT h, *l = _mm256_xor_si256(u, c); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static -int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, size_t len, uint32_t* out) { const __m256i* data_vectors = (const __m256i*)(array); const uint32_t n_cycles = len / 16; @@ -1494,8 +1528,8 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ U(0,1) U(2, 3) U(4, 5) U(6, 7) for (int i = 0; i < 8; ++i) { -#define A0(p) flags[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p)); -#define A1(k) flags[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k)); +#define A0(p) out[ 7 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##p)); +#define A1(k) out[15 - i] += _mm_popcnt_u32(_mm256_movemask_epi8(input##k)); #define A(p, k) A0(p) A1(k) A(0,1) A(2, 3) A(4, 5) A(6, 7) @@ -1528,7 +1562,7 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ i *= 16; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1545,14 +1579,12 @@ int STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(const uint16_t* array, uint32_ return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static -int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -1613,7 +1645,7 @@ int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, ui for (size_t i = 0; i < 16; ++i) { _mm256_storeu_si256((__m256i*)buffer, counter[i]); for (size_t z = 0; z < 16; z++) { - flags[i] += 16 * (uint32_t)buffer[z]; + out[i] += 16 * (uint32_t)buffer[z]; } } } @@ -1621,35 +1653,33 @@ int STORM_pospopcnt_u16_avx2_harvey_seal(const uint16_t* array, uint32_t len, ui _mm256_storeu_si256((__m256i*)buffer, v1); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((buffer[i] & (1 << j)) >> j); + out[j] += ((buffer[i] & (1 << j)) >> j); } } _mm256_storeu_si256((__m256i*)buffer, v2); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 2 * ((buffer[i] & (1 << j)) >> j); + out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm256_storeu_si256((__m256i*)buffer, v4); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 4 * ((buffer[i] & (1 << j)) >> j); + out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm256_storeu_si256((__m256i*)buffer, v8); for (size_t i = 0; i < 16; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += 8 * ((buffer[i] & (1 << j)) >> j); + out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static __m256i STORM_popcnt256(__m256i v) { __m256i lookup1 = _mm256_setr_epi8( @@ -1676,9 +1706,7 @@ __m256i STORM_popcnt256(__m256i v) { } // modified from https://github.com/WojciechMula/sse-popcount -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, const uint8_t* STORM_RESTRICT data2, @@ -1747,9 +1775,7 @@ uint64_t STORM_intersect_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT da } // modified from https://github.com/WojciechMula/sse-popcount -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, const uint8_t* STORM_RESTRICT data2, @@ -1818,9 +1844,7 @@ uint64_t STORM_union_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, } // modified from https://github.com/WojciechMula/sse-popcount -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, const uint8_t* STORM_RESTRICT data2, @@ -1888,11 +1912,9 @@ uint64_t STORM_diff_count_lookup_avx2_func(const uint8_t* STORM_RESTRICT data1, return result; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static -uint64_t STORM_popcnt_avx2(const __m256i* data, uint64_t size) +uint64_t STORM_popcnt_csa_avx2(const __m256i* data, uint64_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -1955,13 +1977,11 @@ uint64_t STORM_popcnt_avx2(const __m256i* data, uint64_t size) * @see https://arxiv.org/abs/1611.07612 */ // In this version we perform the operation A&B as input into the CSA operator. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -2016,13 +2036,11 @@ uint64_t STORM_intersect_count_csa_avx2(const __m256i* STORM_RESTRICT data1, } // In this version we perform the operation A|B as input into the CSA operator. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -2077,13 +2095,11 @@ uint64_t STORM_union_count_csa_avx2(const __m256i* STORM_RESTRICT data1, } // In this version we perform the operation A^B as input into the CSA operator. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, const __m256i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m256i cnt = _mm256_setzero_si256(); __m256i ones = _mm256_setzero_si256(); @@ -2137,13 +2153,11 @@ uint64_t STORM_diff_count_csa_avx2(const __m256i* STORM_RESTRICT data1, cnt64[3]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m256i* r1 = (__m256i*)b1; @@ -2159,13 +2173,11 @@ uint64_t STORM_intersect_count_avx2(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, - const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const uint64_t* STORM_RESTRICT b2, + const size_t n_ints) { uint64_t count = 0; const __m256i* r1 = (__m256i*)b1; @@ -2181,13 +2193,11 @@ uint64_t STORM_union_count_avx2(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1, - const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const uint64_t* STORM_RESTRICT b2, + const size_t n_ints) { uint64_t count = 0; const __m256i* r1 = (__m256i*)b1; @@ -2203,51 +2213,65 @@ uint64_t STORM_diff_count_avx2(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_intersect_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_intersect_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_union_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_union_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx2"))) -#endif +STORM_TARGET("avx2") static uint64_t STORM_diff_count_lookup_avx2(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_diff_count_lookup_avx2_func((uint8_t*)b1, (uint8_t*)b2, n_ints*8); } + +STORM_TARGET("avx2") +static +uint64_t STORM_popcnt_avx2(const uint64_t* data, + const size_t n_ints) +{ + uint64_t count = 0; + const uint32_t n_cycles = n_ints / 4; + const uint32_t n_cycles_sse = (n_ints % 4) / 2; + + const __m256i* r1 = (__m256i*)&data[0]; + const __m128i* r2 = (__m128i*)&data[n_cycles_sse*4]; + + count += STORM_popcnt_csa_avx2(r1, n_cycles); + count += STORM_popcnt_csa_sse4(r2, n_cycles_sse); + + for (int i = (4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) { + count += STORM_POPCOUNT(data[i]); + } + + return count; +} #endif /**************************** * AVX512BW functions ****************************/ -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) #include -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE __m512i STORM_popcnt512(__m512i v) { __m512i m1 = _mm512_set1_epi8(0x55); @@ -2260,9 +2284,7 @@ __m512i STORM_popcnt512(__m512i v) { return _mm512_sad_epu8(t3, _mm512_setzero_si512()); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) { *l = _mm512_ternarylogic_epi32(c, b, a, 0x96); @@ -2272,9 +2294,7 @@ void STORM_CSA512(__m512i* h, __m512i* l, __m512i a, __m512i b, __m512i c) { // By Wojciech Muła // @see https://github.com/WojciechMula/sse-popcount/blob/master/popcnt-avx512-harley-seal.cpp#L3 // @see https://arxiv.org/abs/1611.07612 -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE __m512i STORM_avx512_popcount(const __m512i v) { const __m512i m1 = _mm512_set1_epi8(0x55); // 01010101 @@ -2288,9 +2308,7 @@ __m512i STORM_avx512_popcount(const __m512i v) { } // 512i-version of carry-save adder subroutine. -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") STORM_FORCE_INLINE void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, __m512i* STORM_RESTRICT l, @@ -2300,11 +2318,9 @@ void STORM_pospopcnt_csa_avx512(__m512i* STORM_RESTRICT h, *l = _mm512_ternarylogic_epi32(c, b, *l, 0x96); // 10010110 } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static -uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, uint64_t size) +uint64_t STORM_popcnt_csa_avx512bw(const __m512i* STORM_RESTRICT data, size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2362,14 +2378,12 @@ uint64_t STORM_popcnt_avx512(const __m512i* STORM_RESTRICT data, uint64_t size) cnt64[7]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static -int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -2426,47 +2440,45 @@ int STORM_pospopcnt_u16_avx512bw_harvey_seal(const uint16_t* array, uint32_t len for (size_t i = 0; i < 16; ++i) { _mm512_storeu_si512((__m512i*)buffer, counter[i]); for (size_t z = 0; z < 32; z++) { - flags[i] += 16 * (uint32_t)buffer[z]; + out[i] += 16 * (uint32_t)buffer[z]; } } } _mm512_storeu_si512((__m512i*)buffer, v1); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { - flags[j] += 1 * ((buffer[i] & (1 << j)) >> j); + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { + out[j] += 1 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v2); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { - flags[j] += 2 * ((buffer[i] & (1 << j)) >> j); + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { + out[j] += 2 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v4); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { - flags[j] += 4 * ((buffer[i] & (1 << j)) >> j); + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { + out[j] += 4 * ((buffer[i] & (1 << j)) >> j); } } _mm512_storeu_si512((__m512i*)buffer, v8); - for (size_t i = 0; i < 32; i++) { - for (int j = 0; j < 16; j++) { - flags[j] += 8 * ((buffer[i] & (1 << j)) >> j); + for (size_t i = 0; i < 32; ++i) { + for (int j = 0; j < 16; ++j) { + out[j] += 8 * ((buffer[i] & (1 << j)) >> j); } } return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static -int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, size_t len, uint32_t* out) { #define AND_OR 0xea // ternary function: (a & b) | c const __m512i* data_vectors = (const __m512i*)(data); const uint32_t n_cycles = len / 32; @@ -2484,8 +2496,8 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint U(0,1) U( 2, 3) U( 4, 5) U( 6, 7) for (int i = 0; i < 8; ++i) { -#define A0(p) flags[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p)); -#define A1(k) flags[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k)); +#define A0(p) out[ 7 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##p)); +#define A1(k) out[15 - i] += _mm_popcnt_u64(_mm512_movepi8_mask(input##k)); #define A(p, k) A0(p) A1(k) A(0,1) A(2, 3) A(4,5) A(6, 7) @@ -2519,7 +2531,7 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint i *= 32; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((data[i] & (1 << j)) >> j); + out[j] += ((data[i] & (1 << j)) >> j); } } @@ -2537,11 +2549,9 @@ int STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(const uint16_t* data, uint return 0; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static -int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t len, uint32_t* flags) { +int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, size_t len, uint32_t* out) { __m512i counters[16]; for (size_t i = 0; i < 16; ++i) { @@ -2610,7 +2620,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le // Update. for (size_t i = 0; i < 16; ++i) { _mm512_storeu_si512((__m512i*)tmp, counters[i]); - for (int j = 0; j < 32; ++j) flags[i] += tmp[j]; + for (int j = 0; j < 32; ++j) out[i] += tmp[j]; } // Reset. for (size_t i = 0; i < 16; ++i) { @@ -2643,7 +2653,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le i *= 512; for (/**/; i < len; ++i) { for (int j = 0; j < 16; ++j) { - flags[j] += ((array[i] & (1 << j)) >> j); + out[j] += ((array[i] & (1 << j)) >> j); } } @@ -2659,7 +2669,7 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le for (size_t i = 0; i < 16; ++i) { _mm512_storeu_si512((__m512i*)tmp, counters[i]); - for (int j = 0; j < 32; ++j) flags[i] += tmp[j]; + for (int j = 0; j < 32; ++j) out[i] += tmp[j]; } return 0; } @@ -2671,13 +2681,11 @@ int STORM_pospopcnt_u16_avx512bw_adder_forest(const uint16_t* array, uint32_t le * Wojciech Mula (23 Nov 2016). * @see https://arxiv.org/abs/1611.07612 */ -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2735,13 +2743,11 @@ uint64_t STORM_intersect_count_csa_avx512(const __m512i* STORM_RESTRICT data1, cnt64[7]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, const __m512i* STORM_RESTRICT data2, - uint64_t size) + size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2799,13 +2805,11 @@ uint64_t STORM_union_count_csa_avx512(const __m512i* STORM_RESTRICT data1, cnt64[7]; } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, - const __m512i* STORM_RESTRICT data2, - uint64_t size) + const __m512i* STORM_RESTRICT data2, + size_t size) { __m512i cnt = _mm512_setzero_si512(); __m512i ones = _mm512_setzero_si512(); @@ -2865,13 +2869,11 @@ uint64_t STORM_diff_count_csa_avx512(const __m512i* STORM_RESTRICT data1, // Functions // AVX512 -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m512i* r1 = (const __m512i*)(b1); @@ -2887,13 +2889,11 @@ uint64_t STORM_intersect_count_avx512(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m512i* r1 = (const __m512i*)(b1); @@ -2909,13 +2909,11 @@ uint64_t STORM_union_count_avx512(const uint64_t* STORM_RESTRICT b1, return(count); } -#if !defined(_MSC_VER) - __attribute__ ((target ("avx512bw"))) -#endif +STORM_TARGET("avx512bw") static uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { uint64_t count = 0; const __m512i* r1 = (const __m512i*)(b1); @@ -2930,6 +2928,31 @@ uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, return(count); } + +STORM_TARGET("avx2") +static +uint64_t STORM_popcnt_avx512(const uint64_t* data, + const size_t n_ints) +{ + uint64_t count = 0; + const uint32_t n_cycles = n_ints / 8; + const uint32_t n_cycles_avx2 = (n_ints % 8) / 4; + const uint32_t n_cycles_sse = ((n_ints % 8) % 4) / 2; + + const __m512i* r1 = (__m512i*)&data[0]; + const __m256i* r2 = (__m256i*)&data[n_cycles*8]; + const __m128i* r3 = (__m128i*)&data[n_cycles*8+n_cycles_avx2*4]; + + count += STORM_popcnt_csa_avx512bw(r1, n_cycles); + count += STORM_popcnt_csa_avx2(r2, n_cycles_avx2); + count += STORM_popcnt_csa_sse4(r3, n_cycles_sse); + + for (int i = (8*n_cycles + 4*n_cycles + 2*n_cycles_sse); i < n_ints; ++i) { + count += STORM_POPCOUNT(data[i]); + } + + return count; +} #endif /**************************** @@ -2937,7 +2960,7 @@ uint64_t STORM_diff_count_avx512(const uint64_t* STORM_RESTRICT b1, ****************************/ STORM_FORCE_INLINE -uint64_t STORM_popcount64_unrolled(const uint64_t* data, uint64_t size) { +uint64_t STORM_popcount64_unrolled(const uint64_t* data, size_t size) { uint64_t i = 0; uint64_t limit = size - size % 4; uint64_t cnt = 0; @@ -2962,7 +2985,7 @@ uint64_t STORM_popcount64_unrolled(const uint64_t* data, uint64_t size) { STORM_FORCE_INLINE uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_intersect_count_unrolled(b1, b2, n_ints); } @@ -2970,7 +2993,7 @@ uint64_t STORM_intersect_count_scalar(const uint64_t* STORM_RESTRICT b1, STORM_FORCE_INLINE uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_union_count_unrolled(b1, b2, n_ints); } @@ -2978,7 +3001,7 @@ uint64_t STORM_union_count_scalar(const uint64_t* STORM_RESTRICT b1, STORM_FORCE_INLINE uint64_t STORM_diff_count_scalar(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, - const uint32_t n_ints) + const size_t n_ints) { return STORM_diff_count_unrolled(b1, b2, n_ints); } @@ -2988,22 +3011,18 @@ uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1, const uint64_t* STORM_RESTRICT b2, const uint32_t* STORM_RESTRICT l1, const uint32_t* STORM_RESTRICT l2, - const uint32_t n1, - const uint32_t n2) + const size_t n1, + const size_t n2) { uint64_t count = 0; #define MOD(x) (( (x) * 64 ) >> 6) - if(n1 < n2) { - for (int i = 0; i < n1; ++i) { - count += ((b2[l1[i] >> 6] & (1L << MOD(l1[i]))) != 0); - // __builtin_prefetch(&b2[l1[i] >> 6], 0, _MM_HINT_T0); - } + if (n1 < n2) { + for (int i = 0; i < n1; ++i) + count += ((b2[l1[i] >> 6] & (1L << MOD(l1[i]))) != 0); } else { - for (int i = 0; i < n2; ++i) { + for (int i = 0; i < n2; ++i) count += ((b1[l2[i] >> 6] & (1L << MOD(l2[i]))) != 0); - // __builtin_prefetch(&b1[l2[i] >> 6], 0, _MM_HINT_T0); - } } #undef MOD return(count); @@ -3013,11 +3032,9 @@ uint64_t STORM_intersect_count_scalar_list(const uint64_t* STORM_RESTRICT b1, /* ************************************* * Function pointer definitions. ***************************************/ -typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const uint32_t); -typedef uint64_t (*STORM_compute_lfunc)(const uint64_t*, const uint64_t*, - const uint32_t*, const uint32_t*, const uint32_t, const uint32_t); -typedef int (STORM_pposcnt_func)(const uint16_t*, uint32_t, uint32_t*); -typedef uint64_t (STORM_popcnt_func)(const uint8_t*, uint32_t); +typedef uint64_t (*STORM_compute_func)(const uint64_t*, const uint64_t*, const size_t); +typedef int (STORM_pposcnt_func)(const uint16_t*, size_t, uint32_t*); +typedef uint64_t (STORM_popcnt_func)(const uint8_t*, size_t); /* ************************************* * Alignment @@ -3027,15 +3044,15 @@ typedef uint64_t (STORM_popcnt_func)(const uint8_t*, uint32_t); static uint32_t STORM_get_alignment() { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3047,20 +3064,20 @@ uint32_t STORM_get_alignment() { #endif uint32_t alignment = 0; -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW)) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { // 16*512 alignment = STORM_AVX512_ALIGNMENT; } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && alignment == 0) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && alignment == 0) { // 16*256 alignment = STORM_AVX2_ALIGNMENT; } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && alignment == 0) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && alignment == 0) { // 16*128 alignment = STORM_SSE_ALIGNMENT; } #endif @@ -3075,17 +3092,17 @@ uint32_t STORM_get_alignment() { // Return the optimal intersection function given the range [0, n_bitmaps_vector) // and the available instruction set at run-time. static -STORM_compute_func STORM_get_intersect_count_func(const uint32_t n_bitmaps_vector) { +STORM_compute_func STORM_get_intersect_count_func(const size_t n_bitmaps_vector) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3097,24 +3114,24 @@ STORM_compute_func STORM_get_intersect_count_func(const uint32_t n_bitmaps_vecto #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_intersect_count_avx512; } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_intersect_count_avx2; } - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { return &STORM_intersect_count_lookup_avx2; } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_intersect_count_sse4; } #endif @@ -3123,17 +3140,17 @@ STORM_compute_func STORM_get_intersect_count_func(const uint32_t n_bitmaps_vecto } static -STORM_compute_func STORM_get_union_count_func(const uint32_t n_bitmaps_vector) { +STORM_compute_func STORM_get_union_count_func(const size_t n_bitmaps_vector) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3145,24 +3162,24 @@ STORM_compute_func STORM_get_union_count_func(const uint32_t n_bitmaps_vector) { #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_union_count_avx512; } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_union_count_avx2; } - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { return &STORM_union_count_lookup_avx2; } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_union_count_sse4; } #endif @@ -3171,17 +3188,17 @@ STORM_compute_func STORM_get_union_count_func(const uint32_t n_bitmaps_vector) { } static -STORM_compute_func STORM_get_diff_count_func(const uint32_t n_bitmaps_vector) { +STORM_compute_func STORM_get_diff_count_func(const size_t n_bitmaps_vector) { -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3193,24 +3210,24 @@ STORM_compute_func STORM_get_diff_count_func(const uint32_t n_bitmaps_vector) { #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_bitmaps_vector >= 128) { // 16*512 return &STORM_diff_count_avx512; } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 64) { // 16*256 return &STORM_diff_count_avx2; } - if ((cpuid & STORM_bit_AVX2) && n_bitmaps_vector >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_bitmaps_vector >= 4) { return &STORM_diff_count_lookup_avx2; } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_bitmaps_vector >= 32) { // 16*128 return &STORM_diff_count_sse4; } #endif @@ -3222,17 +3239,20 @@ STORM_compute_func STORM_get_diff_count_func(const uint32_t n_bitmaps_vector) { // Return the optimal intersection function given the range [0, n_bitmaps_vector) // and the available instruction set at run-time. static -uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, const uint32_t n_len) { +uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, + const uint64_t* STORM_RESTRICT data2, + const size_t n_len) +{ -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3244,24 +3264,24 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const uint6 #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_len >= 128) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_intersect_count_avx512(data1, data2, n_len); } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_len >= 64) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_intersect_count_avx2(data1, data2, n_len); } - if ((cpuid & STORM_bit_AVX2) && n_len >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { return STORM_intersect_count_lookup_avx2(data1, data2, n_len); } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_len >= 32) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_intersect_count_sse4(data1, data2, n_len); } #endif @@ -3270,17 +3290,20 @@ uint64_t STORM_intersect_count(const uint64_t* STORM_RESTRICT data1, const uint6 } static -uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, const uint32_t n_len) { +uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, + const uint64_t* STORM_RESTRICT data2, + const size_t n_len) +{ -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3292,24 +3315,24 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_len >= 128) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_union_count_avx512(data1, data2, n_len); } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_len >= 64) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_union_count_avx2(data1, data2, n_len); } - if ((cpuid & STORM_bit_AVX2) && n_len >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { return STORM_union_count_lookup_avx2(data1, data2, n_len); } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_len >= 32) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_union_count_sse4(data1, data2, n_len); } #endif @@ -3318,17 +3341,20 @@ uint64_t STORM_union_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* } static -uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* STORM_RESTRICT data2, const uint32_t n_len) { +uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, + const uint64_t* STORM_RESTRICT data2, + const size_t n_len) +{ -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3340,24 +3366,24 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW) && n_len >= 128) { // 16*512 +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && n_len >= 128) { // 16*512 return STORM_diff_count_avx512(data1, data2, n_len); } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2) && n_len >= 64) { // 16*256 +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 64) { // 16*256 return STORM_diff_count_avx2(data1, data2, n_len); } - if ((cpuid & STORM_bit_AVX2) && n_len >= 4) { + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && n_len >= 4) { return STORM_diff_count_lookup_avx2(data1, data2, n_len); } #endif -#if defined(HAVE_SSE42) - if ((cpuid & STORM_bit_SSE41) && n_len >= 32) { // 16*128 +#if defined(STORM_HAVE_SSE42) + if ((cpuid & STORM_CPUID_runtime_bit_SSE41) && n_len >= 32) { // 16*128 return STORM_diff_count_sse4(data1, data2, n_len); } #endif @@ -3369,19 +3395,20 @@ uint64_t STORM_diff_count(const uint64_t* STORM_RESTRICT data1, const uint64_t* * POPCNT and POSPOPCNT functions. ***************************************/ static -uint64_t STORM_popcnt(const uint8_t* data, uint32_t size) { +uint64_t STORM_popcnt(const uint8_t* data, size_t size) { uint64_t cnt = 0; uint64_t i; + // size /= 8; -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) - /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + /* C++11 thread-safe singleton */ + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3392,39 +3419,59 @@ uint64_t STORM_popcnt(const uint8_t* data, uint32_t size) { #endif #endif -#if defined(HAVE_AVX512) +#if defined(STORM_HAVE_AVX512) /* AVX512 requires arrays >= 1024 bytes */ - if ((cpuid & STORM_bit_AVX512BW) && + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW) && size >= 1024) { - cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64); - data += size - size % 64; - size = size % 64; + // cnt += STORM_popcnt_avx512((const __m512i*)data, size / 64); + // data += size - size % 64; + // size = size % 64; + cnt += STORM_popcnt_avx512((uint64_t*)data, size/8); + data += size - size % 8; + size = size % 8; } #endif -#if defined(HAVE_AVX2) +#if defined(STORM_HAVE_AVX2) /* AVX2 requires arrays >= 512 bytes */ - if ((cpuid & STORM_bit_AVX2) && + if ((cpuid & STORM_CPUID_runtime_bit_AVX2) && size >= 512) { - cnt += STORM_popcnt_avx2((const __m256i*)data, size / 32); - data += size - size % 32; - size = size % 32; + cnt += STORM_popcnt_avx2((uint64_t*)data, size/8); + data += size - size % 8; + size = size % 8; + // data += size - size % 32; + // size = size % 32; } #endif -#if defined(HAVE_POPCNT) +#if defined(HAVE_SSE4) - if (cpuid & STORM_bit_POPCNT) { + /* AVX2 requires arrays >= 512 bytes */ + if ((cpuid & STORM_CPUID_runtime_bit_SSE42) && + size >= 256) + { + cnt += STORM_popcnt_sse4((uint64_t*)data, size/8)); + data += size - size % 8; + size = size % 8; + // data += size - size % 32; + // size = size % 32; + } + +#endif + +#if defined(STORM_HAVE_POPCNT) + + if (cpuid & STORM_CPUID_runtime_bit_POPCNT) { cnt += STORM_popcount64_unrolled((const uint64_t*)data, size / 8); data += size - size % 8; size = size % 8; - for (i = 0; i < size; i++) + for (i = 0; i < size; ++i) cnt += STORM_popcount64(data[i]); return cnt; @@ -3440,25 +3487,25 @@ uint64_t STORM_popcnt(const uint8_t* data, uint32_t size) { } /* pure integer popcount algorithm */ - for (i = 0; i < size; i++) + for (i = 0; i < size; ++i) cnt += STORM_popcount64(data[i]); return cnt; } static -int STORM_pospopcnt_u16(const uint16_t* data, uint32_t len, uint32_t* flags) { - memset(flags, 0, sizeof(uint32_t)*16); +int STORM_pospopcnt_u16(const uint16_t* data, size_t len, uint32_t* out) { + memset(out, 0, sizeof(uint32_t)*16); -#if defined(HAVE_CPUID) +#if defined(STORM_HAVE_CPUID) #if defined(__cplusplus) - /* C++11 thread-safe singleton */ - static const int cpuid = get_cpuid(); + /* C++11 thread-safe singleton */ + static const int cpuid = STORM_get_cpuid(); #else static int cpuid_ = -1; int cpuid = cpuid_; if (cpuid == -1) { - cpuid = get_cpuid(); + cpuid = STORM_get_cpuid(); #if defined(_MSC_VER) _InterlockedCompareExchange(&cpuid_, cpuid, -1); @@ -3469,37 +3516,37 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t len, uint32_t* flags) { #endif #endif -#if defined(HAVE_AVX512) - if ((cpuid & STORM_bit_AVX512BW)) +#if defined(STORM_HAVE_AVX512) + if ((cpuid & STORM_CPUID_runtime_bit_AVX512BW)) { - if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, flags)); // small - else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, flags)); // small - else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, flags)); // medium - else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, flags)); // medium3 - else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, flags)); // fix + if (len < 32) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small + else if (len < 256) return(STORM_pospopcnt_u16_sse_blend_popcnt_unroll8(data, len, out)); // small + else if (len < 512) return(STORM_pospopcnt_u16_avx512bw_blend_popcnt_unroll8(data, len, out)); // medium + else if (len < 4096) return(STORM_pospopcnt_u16_avx512bw_adder_forest(data, len, out)); // medium3 + else return(STORM_pospopcnt_u16_avx512bw_harvey_seal(data, len, out)); // fix } #endif -#if defined(HAVE_AVX2) - if ((cpuid & STORM_bit_AVX2)) +#if defined(STORM_HAVE_AVX2) + if ((cpuid & STORM_CPUID_runtime_bit_AVX2)) { - if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, flags)); // small - else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, flags)); // medium - else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, flags)); // large + if (len < 128) return(STORM_pospopcnt_u16_sse_sad(data, len, out)); // small + else if (len < 1024) return(STORM_pospopcnt_u16_avx2_blend_popcnt_unroll8(data, len, out)); // medium + else return(STORM_pospopcnt_u16_avx2_harvey_seal(data, len, out)); // large } #endif #if defined(HAVE_SSE4) - if ((cpuid & STORM_bit_SSE42)) + if ((cpuid & STORM_CPUID_runtime_bit_SSE42)) { - return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, flags)); + return(STORM_pospopcnt_u16_sse_harvey_seal(data, len, out)); } #endif #ifndef _MSC_VER - return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, flags)); // fallback scalar + return(STORM_pospopcnt_u16_scalar_umul128_unroll2(data, len, out)); // fallback scalar #else - return(STORM_pospopcnt_u16_scalar_naive(data, len, flags)); + return(STORM_pospopcnt_u16_scalar_naive(data, len, out)); #endif } @@ -3507,4 +3554,4 @@ int STORM_pospopcnt_u16(const uint16_t* data, uint32_t len, uint32_t* flags) { } /* extern "C" */ #endif -#endif /* LIBALGEBRA_H_9827563662203 */ +#endif /* LIBALGEBRA_H_8723467365934 */