Skip to content

Commit 1927740

Browse files
committed
Use an AVX2 implementation with dynamic dispatch to speedup AdapterCounter
1 parent 5bb67d9 commit 1927740

File tree

1 file changed

+110
-30
lines changed

1 file changed

+110
-30
lines changed

src/sequali/_qcmodule.c

Lines changed: 110 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -37,10 +37,6 @@ along with Sequali. If not, see <https://www.gnu.org/licenses/
3737
#include <math.h>
3838
#include <stdbool.h>
3939

40-
#ifdef __SSE2__
41-
#include "emmintrin.h"
42-
#endif
43-
4440
/* Pointers to types that will be imported/initialized in the module
4541
initialization section */
4642

@@ -2164,6 +2160,8 @@ typedef struct AdapterCounterStruct {
21642160
bitmask_t *init_masks;
21652161
bitmask_t *found_masks;
21662162
bitmask_t (*bitmasks)[NUC_TABLE_SIZE];
2163+
/* Same as bitmasks, but better organization for vectorized approach. */
2164+
bitmask_t (*by_four_bitmasks)[NUC_TABLE_SIZE][4];
21672165
AdapterSequence **adapter_sequences;
21682166
} AdapterCounter;
21692167

@@ -2185,6 +2183,7 @@ AdapterCounter_dealloc(AdapterCounter *self)
21852183
PyMem_Free(self->found_masks);
21862184
PyMem_Free(self->init_masks);
21872185
PyMem_Free(self->bitmasks);
2186+
PyMem_Free(self->by_four_bitmasks);
21882187
PyMem_Free(self->adapter_sequences);
21892188

21902189
PyTypeObject *tp = Py_TYPE((PyObject *)self);
@@ -2272,9 +2271,11 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
22722271
PyMem_Calloc(matcher_array_size, sizeof(AdapterSequence *));
22732272
self->bitmasks =
22742273
PyMem_Calloc(matcher_array_size, NUC_TABLE_SIZE * sizeof(bitmask_t));
2274+
self->by_four_bitmasks = PyMem_Calloc(
2275+
matcher_array_size / 4, NUC_TABLE_SIZE * 4 * sizeof(bitmask_t));
22752276
if (self->adapter_counter == NULL || self->found_masks == NULL ||
22762277
self->init_masks == NULL || self->adapter_sequences == NULL ||
2277-
self->bitmasks == NULL) {
2278+
self->bitmasks == NULL || self->by_four_bitmasks == NULL) {
22782279
PyErr_NoMemory();
22792280
goto error;
22802281
}
@@ -2331,6 +2332,16 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
23312332
self->init_masks[matcher_index] = init_mask;
23322333
matcher_index += 1;
23332334
}
2335+
/* Initialize an array for better vectorized loading. Doing it here is
2336+
much more efficient than doing it for every string. */
2337+
for (size_t i = 0; i < self->number_of_matchers; i += 4) {
2338+
for (size_t j = 0; j < NUC_TABLE_SIZE; j++) {
2339+
self->by_four_bitmasks[i / 4][j][0] = self->bitmasks[i][j];
2340+
self->by_four_bitmasks[i / 4][j][1] = self->bitmasks[i + 1][j];
2341+
self->by_four_bitmasks[i / 4][j][2] = self->bitmasks[i + 2][j];
2342+
self->by_four_bitmasks[i / 4][j][3] = self->bitmasks[i + 3][j];
2343+
}
2344+
}
23342345
self->adapters = adapters;
23352346
return (PyObject *)self;
23362347

@@ -2362,31 +2373,6 @@ AdapterCounter_resize(AdapterCounter *self, size_t new_size)
23622373
return 0;
23632374
}
23642375

2365-
#ifdef __SSE2__
2366-
static inline int
2367-
bitwise_and_nonzero_si128(__m128i vector1, __m128i vector2)
2368-
{
2369-
/* There is no way to directly check if an entire vector is set to 0
2370-
so some trickery needs to be done to ascertain if one of the bits is
2371-
set.
2372-
_mm_movemask_epi8 only catches the most significant bit. So we need to
2373-
set that bit. Comparison for larger than 0 does not work since only
2374-
signed comparisons are available. So the most significant bit makes
2375-
integers smaller than 0. Instead we do a saturated add of 127.
2376-
_mm_adds_epu8 works on unsigned integers. So 0b10000000 (128) will
2377-
become 255. Also everything above 0 will trigger the last bit to be set.
2378-
0 itself results in 0b01111111 so the most significant bit will not be
2379-
set.
2380-
The sequence of instructions below is faster than
2381-
return (!_mm_test_all_zeros(vector1, vector2));
2382-
which is available in SSE4.1. So there is no value in moving up one
2383-
instruction set. */
2384-
__m128i and = _mm_and_si128(vector1, vector2);
2385-
__m128i res = _mm_adds_epu8(and, _mm_set1_epi8(127));
2386-
return _mm_movemask_epi8(res);
2387-
}
2388-
#endif
2389-
23902376
static inline bitmask_t
23912377
update_adapter_count_array(size_t position, bitmask_t match,
23922378
bitmask_t already_found,
@@ -2441,6 +2427,90 @@ find_single_matcher(const uint8_t *sequence, size_t sequence_length,
24412427
}
24422428
}
24432429

2430+
static void (*find_four_matchers)(const uint8_t *sequence, size_t sequence_length,
2431+
const bitmask_t *restrict init_masks,
2432+
const bitmask_t *restrict found_masks,
2433+
const bitmask_t (*by_four_bitmasks)[4],
2434+
AdapterSequence **adapter_sequences_store,
2435+
uint64_t **adapter_counter) = NULL;
2436+
2437+
#if COMPILER_HAS_TARGETED_DISPATCH && BUILD_IS_X86_64
2438+
__attribute__((__target__("avx2"))) static void
2439+
find_four_matchers_avx2(const uint8_t *sequence, size_t sequence_length,
2440+
const bitmask_t *restrict init_masks,
2441+
const bitmask_t *restrict found_masks,
2442+
const bitmask_t (*by_four_bitmasks)[4],
2443+
AdapterSequence **adapter_sequences_store,
2444+
uint64_t **adapter_counter)
2445+
{
2446+
bitmask_t fmask0 = found_masks[0];
2447+
bitmask_t fmask1 = found_masks[1];
2448+
bitmask_t fmask2 = found_masks[2];
2449+
bitmask_t fmask3 = found_masks[3];
2450+
bitmask_t already_found0 = 0;
2451+
bitmask_t already_found1 = 0;
2452+
bitmask_t already_found2 = 0;
2453+
bitmask_t already_found3 = 0;
2454+
2455+
__m256i found_mask = _mm256_loadu_si256((const __m256i *)(found_masks));
2456+
__m256i init_mask = _mm256_loadu_si256((const __m256i *)(init_masks));
2457+
2458+
__m256i R = _mm256_setzero_si256();
2459+
const bitmask_t(*bitmask)[4] = by_four_bitmasks;
2460+
2461+
for (size_t pos = 0; pos < sequence_length; pos++) {
2462+
R = _mm256_slli_epi64(R, 1);
2463+
R = _mm256_or_si256(R, init_mask);
2464+
uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]];
2465+
R = _mm256_and_si256(R, _mm256_loadu_si256((__m256i *)bitmask[index]));
2466+
2467+
__m256i check = _mm256_and_si256(R, found_mask);
2468+
/* Adding 0b01111111 (127) to any number higher than 0 sets the bit for
2469+
128. Movemask collects these bits. This way we can test if there is
2470+
a 1 across the entire 256-bit vector. */
2471+
int check_int =
2472+
_mm256_movemask_epi8(_mm256_adds_epu8(check, _mm256_set1_epi8(127)));
2473+
if (check_int) {
2474+
bitmask_t Rray[4];
2475+
_mm256_storeu_si256(((__m256i *)Rray), R);
2476+
2477+
if (Rray[0] & fmask0) {
2478+
already_found0 = update_adapter_count_array(
2479+
pos, Rray[0], already_found0, adapter_sequences_store[0],
2480+
adapter_counter);
2481+
}
2482+
if (Rray[1] & fmask1) {
2483+
already_found1 = update_adapter_count_array(
2484+
pos, Rray[1], already_found1, adapter_sequences_store[1],
2485+
adapter_counter);
2486+
}
2487+
if (Rray[2] & fmask2) {
2488+
already_found2 = update_adapter_count_array(
2489+
pos, Rray[2], already_found2, adapter_sequences_store[2],
2490+
adapter_counter);
2491+
}
2492+
if (Rray[3] & fmask3) {
2493+
already_found3 = update_adapter_count_array(
2494+
pos, Rray[3], already_found3, adapter_sequences_store[3],
2495+
adapter_counter);
2496+
}
2497+
}
2498+
}
2499+
}
2500+
2501+
/* Constructor runs at dynamic load time */
2502+
__attribute__((constructor)) static void
2503+
find_four_matchers_init_func_ptr(void)
2504+
{
2505+
if (__builtin_cpu_supports("avx2")) {
2506+
find_four_matchers = find_four_matchers_avx2;
2507+
}
2508+
else {
2509+
find_four_matchers = NULL;
2510+
}
2511+
}
2512+
#endif
2513+
24442514
static int
24452515
AdapterCounter_add_meta(AdapterCounter *self, struct FastqMeta *meta)
24462516
{
@@ -2462,6 +2532,16 @@ AdapterCounter_add_meta(AdapterCounter *self, struct FastqMeta *meta)
24622532
AdapterSequence **adapter_sequences = self->adapter_sequences;
24632533
uint64_t **adapter_count_array = self->adapter_counter;
24642534
while (matcher_index < number_of_matchers) {
2535+
/* Only run when a vectorized function pointer is initialized */
2536+
if (find_four_matchers && number_of_matchers - matcher_index > 1) {
2537+
find_four_matchers(
2538+
sequence, sequence_length, init_masks + matcher_index,
2539+
found_masks + matcher_index,
2540+
self->by_four_bitmasks[matcher_index / 4],
2541+
adapter_sequences + matcher_index, adapter_count_array);
2542+
matcher_index += 4;
2543+
continue;
2544+
}
24652545
find_single_matcher(
24662546
sequence, sequence_length, init_masks + matcher_index,
24672547
found_masks + matcher_index, bitmasks + matcher_index,

0 commit comments

Comments
 (0)