Skip to content

Commit 79d5047

Browse files
authored
Optimize bitcount command by SIMD (#1741)
**Background** - Currently, we implement bitcount using a lookup table method - By SIMD, parallel table lookups can be achieved, which boosts performance - Most x86 servers support the AVX2 instruction set **BenchMark** | Value Size | QPS (After optimization) | QPS (Before optimization) | change | | ---- | ---- | ---- | ---- | |16 B | 114925| 115924 | -0.8%| |256 B| 112619 | 112201| +0.3%| |4 KB| 105523|96251| +9.6%| |64 KB|79723|36796| +116%| |1MB|21306|3466|+514%| CPU: AMD EPYC 9754 128-Core Processor * 8 OS: Ubuntu Server 22.04 LTS 64bit Memory: 16GB VM: Tencent cloud SA5.2XLARGE16 **Test Plan** Pending. Will add test if it looks okay **Other** This PR is based on https://github.com/WojciechMula/sse-popcount/blob/master/popcnt-avx2-lookup.cpp --------- Signed-off-by: chzhoo <[email protected]>
1 parent b360f96 commit 79d5047

File tree

3 files changed

+191
-13
lines changed

3 files changed

+191
-13
lines changed

src/bitops.c

Lines changed: 114 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,26 +29,113 @@
2929
*/
3030

3131
#include "server.h"
32-
32+
#ifdef HAVE_AVX2
33+
#include <immintrin.h>
34+
#endif
3335
/* -----------------------------------------------------------------------------
3436
* Helpers and low level bit functions.
3537
* -------------------------------------------------------------------------- */
3638

37-
/* Count number of bits set in the binary array pointed by 's' and long
38-
* 'count' bytes. The implementation of this function is required to
39-
* work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */
40-
long long serverPopcount(void *s, long count) {
39+
static const unsigned char bitsinbyte[256] = {
40+
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2,
41+
3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
42+
3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
43+
6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
44+
3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
45+
5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
46+
6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
47+
48+
#ifdef HAVE_AVX2
49+
/* The SIMD version of popcount enhances performance through parallel lookup tables which is based on the following article:
50+
* https://arxiv.org/pdf/1611.07612 */
51+
ATTRIBUTE_TARGET_AVX2
52+
long long popcountAVX2(void *s, long count) {
53+
long i = 0;
54+
unsigned char *p = (unsigned char *)s;
55+
long long bits = 0;
56+
57+
/* clang-format off */
58+
const __m256i lookup = _mm256_setr_epi8(
59+
/* First Lane [0:127] */
60+
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
61+
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
62+
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
63+
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
64+
65+
/* Second Lane [128:255] identical to first lane due to lane isolation in _mm256_shuffle_epi8.
66+
* For more information, see following URL
67+
* https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8 */
68+
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
69+
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
70+
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
71+
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4);
72+
/* clang-format on */
73+
const __m256i low_mask = _mm256_set1_epi8(0x0f);
74+
__m256i acc = _mm256_setzero_si256();
75+
76+
/* Count 32 bytes per iteration. */
77+
#define ITER_32_BYTES \
78+
{ \
79+
const __m256i vec = _mm256_loadu_si256((const __m256i *)(p + i)); \
80+
const __m256i lo = _mm256_and_si256(vec, low_mask); \
81+
const __m256i hi = _mm256_and_si256(_mm256_srli_epi16(vec, 4), low_mask); \
82+
const __m256i popcnt1 = _mm256_shuffle_epi8(lookup, lo); \
83+
const __m256i popcnt2 = _mm256_shuffle_epi8(lookup, hi); \
84+
local = _mm256_add_epi8(local, popcnt1); \
85+
local = _mm256_add_epi8(local, popcnt2); \
86+
i += 32; \
87+
}
88+
89+
/* We divide the array into the following three parts
90+
* Part A Part B Part C
91+
* +-----------------+--------------+---------+
92+
* | 8 * 32bytes * X | 32bytes * Y | Z bytes |
93+
* +-----------------+--------------+---------+
94+
*/
95+
96+
/* Part A: loop unrolling, processing 8 * 32 bytes per iteration. */
97+
while (i + 8 * 32 <= count) {
98+
__m256i local = _mm256_setzero_si256();
99+
ITER_32_BYTES
100+
ITER_32_BYTES
101+
ITER_32_BYTES
102+
ITER_32_BYTES
103+
ITER_32_BYTES
104+
ITER_32_BYTES
105+
ITER_32_BYTES
106+
ITER_32_BYTES
107+
acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
108+
}
109+
110+
/* Part B: when the remaining data length is less than 8 * 32 bytes,
111+
* process 32 bytes per iteration. */
112+
__m256i local = _mm256_setzero_si256();
113+
while (i + 32 <= count) {
114+
ITER_32_BYTES;
115+
}
116+
acc = _mm256_add_epi64(acc, _mm256_sad_epu8(local, _mm256_setzero_si256()));
117+
118+
#undef ITER_32_BYTES
119+
120+
bits += _mm256_extract_epi64(acc, 0);
121+
bits += _mm256_extract_epi64(acc, 1);
122+
bits += _mm256_extract_epi64(acc, 2);
123+
bits += _mm256_extract_epi64(acc, 3);
124+
125+
/* Part C: count the remaining bytes. */
126+
for (; i < count; i++) {
127+
bits += bitsinbyte[p[i]];
128+
}
129+
130+
return bits;
131+
}
132+
#endif
133+
134+
/* The scalar version of popcount based on lookup tables. */
135+
long long popcountScalar(void *s, long count) {
41136
long long bits = 0;
42137
unsigned char *p = s;
43138
uint32_t *p4;
44-
static const unsigned char bitsinbyte[256] = {
45-
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2,
46-
3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3,
47-
3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5,
48-
6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4,
49-
3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4,
50-
5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6,
51-
6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
52139

53140
/* Count initial bytes not aligned to 32 bit. */
54141
while ((unsigned long)p & 3 && count) {
@@ -97,6 +184,20 @@ long long serverPopcount(void *s, long count) {
97184
return bits;
98185
}
99186

187+
/* Count number of bits set in the binary array pointed by 's' and long
188+
* 'count' bytes. The implementation of this function is required to
189+
* work with an input string length up to 512 MB or more (server.proto_max_bulk_len) */
190+
long long serverPopcount(void *s, long count) {
191+
#ifdef HAVE_AVX2
192+
/* If length of s >= 256 bits and the CPU supports AVX2,
193+
* we prefer to use the SIMD version */
194+
if (count >= 32) {
195+
return popcountAVX2(s, count);
196+
}
197+
#endif
198+
return popcountScalar(s, count);
199+
}
200+
100201
/* Return the position of the first bit set to one (if 'bit' is 1) or
101202
* zero (if 'bit' is 0) in the bitmap starting at 's' and long 'count' bytes.
102203
*

src/unit/test_bitops.c

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#include <time.h>
2+
#include <stdint.h>
3+
4+
#include "test_help.h"
5+
6+
#include "../config.h"
7+
#include "../zmalloc.h"
8+
9+
extern long long popcountScalar(void *s, long count);
10+
#ifdef HAVE_AVX2
11+
extern long long popcountAVX2(void *s, long count);
12+
#endif
13+
14+
static long long bitcount(void *s, long count) {
15+
long long bits = 0;
16+
uint8_t *p = (uint8_t *)s;
17+
for (int x = 0; x < count; x += 1) {
18+
uint8_t val = *(x + p);
19+
while (val) {
20+
bits += val & 1;
21+
val >>= 1;
22+
}
23+
}
24+
return bits;
25+
}
26+
27+
static int test_case(const char *msg, int size) {
28+
uint8_t buf[size];
29+
int fuzzing = 1000;
30+
for (int y = 0; y < fuzzing; y += 1) {
31+
for (int z = 0; z < size; z += 1) {
32+
buf[z] = random() % 256;
33+
}
34+
35+
long long expect = bitcount(buf, size);
36+
long long ret_scalar = popcountScalar(buf, size);
37+
TEST_ASSERT_MESSAGE(msg, expect == ret_scalar);
38+
#ifdef HAVE_AVX2
39+
long long ret_avx2 = popcountAVX2(buf, size);
40+
TEST_ASSERT_MESSAGE(msg, expect == ret_avx2);
41+
#endif
42+
}
43+
44+
return 0;
45+
}
46+
47+
int test_popcount(int argc, char **argv, int flags) {
48+
UNUSED(argc);
49+
UNUSED(argv);
50+
UNUSED(flags);
51+
52+
#define TEST_CASE(MSG, SIZE) \
53+
if (test_case("Test failed: " MSG, SIZE)) { \
54+
return 1; \
55+
}
56+
57+
/* The AVX2 version divides the array into the following 3 parts."
58+
* Part A Part B Part C
59+
* +-----------------+--------------+---------+
60+
* | 8 * 32bytes * X | 32bytes * Y | Z bytes |
61+
* +-----------------+--------------+---------+
62+
*/
63+
/* So we test the following cases */
64+
TEST_CASE("Popcount(Part A)", 8 * 32 * 2);
65+
TEST_CASE("Popcount(Part B)", 32 * 2);
66+
TEST_CASE("Popcount(Part C)", 2);
67+
TEST_CASE("Popcount(Part A + Part B)", 8 * 32 * 7 + 32 * 2);
68+
TEST_CASE("Popcount(Part A + Part C)", 8 * 32 * 11 + 7);
69+
TEST_CASE("Popcount(Part A + Part B + Part C)", 8 * 32 * 3 + 3 * 32 + 5);
70+
TEST_CASE("Popcount(Corner case)", 0);
71+
#undef TEST_CASE
72+
73+
return 0;
74+
}

src/unit/test_files.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ typedef struct unitTest {
66
unitTestProc *proc;
77
} unitTest;
88

9+
int test_popcount(int argc, char **argv, int flags);
910
int test_crc64(int argc, char **argv, int flags);
1011
int test_crc64combine(int argc, char **argv, int flags);
1112
int test_dictCreate(int argc, char **argv, int flags);
@@ -231,6 +232,7 @@ int test_zmallocInitialUsedMemory(int argc, char **argv, int flags);
231232
int test_zmallocAllocReallocCallocAndFree(int argc, char **argv, int flags);
232233
int test_zmallocAllocZeroByteAndFree(int argc, char **argv, int flags);
233234

235+
unitTest __test_bitops_c[] = {{"test_popcount", test_popcount}, {NULL, NULL}};
234236
unitTest __test_crc64_c[] = {{"test_crc64", test_crc64}, {NULL, NULL}};
235237
unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {NULL, NULL}};
236238
unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}};
@@ -255,6 +257,7 @@ struct unitTestSuite {
255257
char *filename;
256258
unitTest *tests;
257259
} unitTestSuite[] = {
260+
{"test_bitops.c", __test_bitops_c},
258261
{"test_crc64.c", __test_crc64_c},
259262
{"test_crc64combine.c", __test_crc64combine_c},
260263
{"test_dict.c", __test_dict_c},

0 commit comments

Comments
 (0)