|
29 | 29 | */
|
30 | 30 |
|
31 | 31 | #include "server.h"
|
32 |
| - |
| 32 | +#ifdef HAVE_AVX2 |
| 33 | +#include <immintrin.h> |
| 34 | +#endif |
33 | 35 | /* -----------------------------------------------------------------------------
|
34 | 36 | * Helpers and low level bit functions.
|
35 | 37 | * -------------------------------------------------------------------------- */
|
36 | 38 |
|
37 |
| -/* Count number of bits set in the binary array pointed by 's' and long |
38 |
| - * 'count' bytes. The implementation of this function is required to |
39 |
| - * work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */ |
40 |
| -long long serverPopcount(void *s, long count) { |
/* Number of bits set in every possible byte value: bitsinbyte[v] is the
 * population count of v. Each row below covers one value of the high nibble,
 * each column one value of the low nibble. */
static const unsigned char bitsinbyte[256] = {
    /* 0x00 */ 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    /* 0x10 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x20 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x30 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0x40 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x50 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0x60 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0x70 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0x80 */ 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    /* 0x90 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0xa0 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0xb0 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0xc0 */ 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    /* 0xd0 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0xe0 */ 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    /* 0xf0 */ 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8,
};
| 47 | + |
| 48 | +#ifdef HAVE_AVX2 |
| 49 | +/* The SIMD version of popcount enhances performance through parallel lookup tables which is based on the following article: |
| 50 | + * https://arxiv.org/pdf/1611.07612 */ |
| 51 | +ATTRIBUTE_TARGET_AVX2 |
| 52 | +long long popcountAVX2(void *s, long count) { |
| 53 | + long i = 0; |
| 54 | + unsigned char *p = (unsigned char *)s; |
| 55 | + long long bits = 0; |
| 56 | + |
| 57 | + /* clang-format off */ |
| 58 | + const __m256i lookup = _mm256_setr_epi8( |
| 59 | + /* First Lane [0:127] */ |
| 60 | + /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, |
| 61 | + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, |
| 62 | + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, |
| 63 | + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, |
| 64 | + |
| 65 | + /* Second Lane [128:255] identical to first lane due to lane isolation in _mm256_shuffle_epi8. |
| 66 | + * For more information, see following URL |
| 67 | + * https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8 */ |
| 68 | + /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, |
| 69 | + /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, |
| 70 | + /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, |
| 71 | + /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4); |
| 72 | + /* clang-format on */ |
| 73 | + const __m256i low_mask = _mm256_set1_epi8(0x0f); |
| 74 | + __m256i acc = _mm256_setzero_si256(); |
| 75 | + |
| 76 | +/* Count 32 bytes per iteration. */ |
| 77 | +#define ITER_32_BYTES \ |
| 78 | + { \ |
| 79 | + const __m256i vec = _mm256_loadu_si256((const __m256i *)(p + i)); \ |
| 80 | + const __m256i lo = _mm256_and_si256(vec, low_mask); \ |
| 81 | + const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \ |
| 82 | + const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \ |
| 83 | + const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \ |
| 84 | + local = _mm256_add_epi8(local, popcnt1); \ |
| 85 | + local = _mm256_add_epi8(local, popcnt2); \ |
| 86 | + i += 32; \ |
| 87 | + } |
| 88 | + |
| 89 | + /* We divide the array into the following three parts |
| 90 | + * Part A Part B Part C |
| 91 | + * +-----------------+--------------+---------+ |
| 92 | + * | 8 * 32bytes * X | 32bytes * Y | Z bytes | |
| 93 | + * +-----------------+--------------+---------+ |
| 94 | + */ |
| 95 | + |
| 96 | + /* Part A: loop unrolling, processing 8 * 32 bytes per iteration. */ |
| 97 | + while (i + 8 * 32 <= count) { |
| 98 | + __m256i local = _mm256_setzero_si256(); |
| 99 | + ITER_32_BYTES |
| 100 | + ITER_32_BYTES |
| 101 | + ITER_32_BYTES |
| 102 | + ITER_32_BYTES |
| 103 | + ITER_32_BYTES |
| 104 | + ITER_32_BYTES |
| 105 | + ITER_32_BYTES |
| 106 | + ITER_32_BYTES |
| 107 | + acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); |
| 108 | + } |
| 109 | + |
| 110 | + /* Part B: when the remaining data length is less than 8 * 32 bytes, |
| 111 | + * process 32 bytes per iteration. */ |
| 112 | + __m256i local = _mm256_setzero_si256(); |
| 113 | + while (i + 32 <= count) { |
| 114 | + ITER_32_BYTES; |
| 115 | + } |
| 116 | + acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256())); |
| 117 | + |
| 118 | +#undef ITER_32_BYTES |
| 119 | + |
| 120 | + bits += _mm256_extract_epi64(acc, 0); |
| 121 | + bits += _mm256_extract_epi64(acc, 1); |
| 122 | + bits += _mm256_extract_epi64(acc, 2); |
| 123 | + bits += _mm256_extract_epi64(acc, 3); |
| 124 | + |
| 125 | + /* Part C: count the remaining bytes. */ |
| 126 | + for (; i < count; i++) { |
| 127 | + bits += bitsinbyte[p[i]]; |
| 128 | + } |
| 129 | + |
| 130 | + return bits; |
| 131 | +} |
| 132 | +#endif |
| 133 | + |
| 134 | +/* The scalar version of popcount based on lookup tables. */ |
| 135 | +long long popcountScalar(void *s, long count) { |
41 | 136 | long long bits = 0;
|
42 | 137 | unsigned char *p = s;
|
43 | 138 | uint32_t *p4;
|
44 |
| - static const unsigned char bitsinbyte[256] = { |
45 |
| - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, |
46 |
| - 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, |
47 |
| - 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, |
48 |
| - 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, |
49 |
| - 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, |
50 |
| - 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, |
51 |
| - 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; |
52 | 139 |
|
53 | 140 | /* Count initial bytes not aligned to 32 bit. */
|
54 | 141 | while ((unsigned long)p & 3 && count) {
|
@@ -97,6 +184,20 @@ long long serverPopcount(void *s, long count) {
|
97 | 184 | return bits;
|
98 | 185 | }
|
99 | 186 |
|
/* Count number of bits set in the binary array pointed by 's' and long
 * 'count' bytes. The implementation of this function is required to
 * work with an input string length up to 512 MB or more (server.proto_max_bulk_len).
 *
 * When built with HAVE_AVX2, inputs of at least 32 bytes (256 bits) are
 * handed to popcountAVX2() provided the running CPU reports AVX2 support;
 * every other case is handled by popcountScalar(). */
long long serverPopcount(void *s, long count) {
#ifdef HAVE_AVX2
    /* popcountAVX2() is compiled with a per-function AVX2 target, so the fact
     * that it was built says nothing about the CPU we are running on. Check
     * at runtime to avoid executing AVX2 instructions on a host without them.
     * NOTE(review): assumes HAVE_AVX2 is only defined for GCC/Clang on x86,
     * where __builtin_cpu_supports() is available -- confirm where HAVE_AVX2
     * is defined. */
    if (count >= 32 && __builtin_cpu_supports("avx2")) {
        return popcountAVX2(s, count);
    }
#endif
    return popcountScalar(s, count);
}
| 200 | + |
100 | 201 | /* Return the position of the first bit set to one (if 'bit' is 1) or
|
101 | 202 | * zero (if 'bit' is 0) in the bitmap starting at 's' and long 'count' bytes.
|
102 | 203 | *
|
|
0 commit comments