@@ -37,10 +37,6 @@ along with Sequali. If not, see <https://www.gnu.org/licenses/
37
37
#include <math.h>
38
38
#include <stdbool.h>
39
39
40
- #ifdef __SSE2__
41
- #include "emmintrin.h"
42
- #endif
43
-
44
40
/* Pointers to types that will be imported/initialized in the module
45
41
initialization section */
46
42
@@ -2164,6 +2160,8 @@ typedef struct AdapterCounterStruct {
2164
2160
bitmask_t * init_masks ;
2165
2161
bitmask_t * found_masks ;
2166
2162
bitmask_t (* bitmasks )[NUC_TABLE_SIZE ];
2163
+ /* Same as bitmasks, but better organization for vectorized approach. */
2164
+ bitmask_t (* by_four_bitmasks )[NUC_TABLE_SIZE ][4 ];
2167
2165
AdapterSequence * * adapter_sequences ;
2168
2166
} AdapterCounter ;
2169
2167
@@ -2185,6 +2183,7 @@ AdapterCounter_dealloc(AdapterCounter *self)
2185
2183
PyMem_Free (self -> found_masks );
2186
2184
PyMem_Free (self -> init_masks );
2187
2185
PyMem_Free (self -> bitmasks );
2186
+ PyMem_Free (self -> by_four_bitmasks );
2188
2187
PyMem_Free (self -> adapter_sequences );
2189
2188
2190
2189
PyTypeObject * tp = Py_TYPE ((PyObject * )self );
@@ -2272,9 +2271,11 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
2272
2271
PyMem_Calloc (matcher_array_size , sizeof (AdapterSequence * ));
2273
2272
self -> bitmasks =
2274
2273
PyMem_Calloc (matcher_array_size , NUC_TABLE_SIZE * sizeof (bitmask_t ));
2274
+ self -> by_four_bitmasks = PyMem_Calloc (
2275
+ matcher_array_size / 4 , NUC_TABLE_SIZE * 4 * sizeof (bitmask_t ));
2275
2276
if (self -> adapter_counter == NULL || self -> found_masks == NULL ||
2276
2277
self -> init_masks == NULL || self -> adapter_sequences == NULL ||
2277
- self -> bitmasks == NULL ) {
2278
+ self -> bitmasks == NULL || self -> by_four_bitmasks == NULL ) {
2278
2279
PyErr_NoMemory ();
2279
2280
goto error ;
2280
2281
}
@@ -2331,6 +2332,16 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
2331
2332
self -> init_masks [matcher_index ] = init_mask ;
2332
2333
matcher_index += 1 ;
2333
2334
}
2335
+ /* Initialize an array for better vectorized loading. Doing it here is
2336
+ much more efficient than doing it for every string. */
2337
+ for (size_t i = 0 ; i < self -> number_of_matchers ; i += 4 ) {
2338
+ for (size_t j = 0 ; j < NUC_TABLE_SIZE ; j ++ ) {
2339
+ self -> by_four_bitmasks [i / 4 ][j ][0 ] = self -> bitmasks [i ][j ];
2340
+ self -> by_four_bitmasks [i / 4 ][j ][1 ] = self -> bitmasks [i + 1 ][j ];
2341
+ self -> by_four_bitmasks [i / 4 ][j ][2 ] = self -> bitmasks [i + 2 ][j ];
2342
+ self -> by_four_bitmasks [i / 4 ][j ][3 ] = self -> bitmasks [i + 3 ][j ];
2343
+ }
2344
+ }
2334
2345
self -> adapters = adapters ;
2335
2346
return (PyObject * )self ;
2336
2347
@@ -2362,31 +2373,6 @@ AdapterCounter_resize(AdapterCounter *self, size_t new_size)
2362
2373
return 0 ;
2363
2374
}
2364
2375
2365
- #ifdef __SSE2__
2366
- static inline int
2367
- bitwise_and_nonzero_si128 (__m128i vector1 , __m128i vector2 )
2368
- {
2369
- /* There is no way to directly check if an entire vector is set to 0
2370
- so some trickery needs to be done to ascertain if one of the bits is
2371
- set.
2372
- _mm_movemask_epi8 only catches the most significant bit. So we need to
2373
- set that bit. Comparison for larger than 0 does not work since only
2374
- signed comparisons are available. So the most significant bit makes
2375
- integers smaller than 0. Instead we do a saturated add of 127.
2376
- _mm_adds_epu8 works on unsigned integers. So 0b10000000 (128) will
2377
- become 255. Also everything above 0 will trigger the last bit to be set.
2378
- 0 itself results in 0b01111111 so the most significant bit will not be
2379
- set.
2380
- The sequence of instructions below is faster than
2381
- return (!_mm_test_all_zeros(vector1, vector2));
2382
- which is available in SSE4.1. So there is no value in moving up one
2383
- instruction set. */
2384
- __m128i and = _mm_and_si128 (vector1 , vector2 );
2385
- __m128i res = _mm_adds_epu8 (and , _mm_set1_epi8 (127 ));
2386
- return _mm_movemask_epi8 (res );
2387
- }
2388
- #endif
2389
-
2390
2376
static inline bitmask_t
2391
2377
update_adapter_count_array (size_t position , bitmask_t match ,
2392
2378
bitmask_t already_found ,
@@ -2441,6 +2427,90 @@ find_single_matcher(const uint8_t *sequence, size_t sequence_length,
2441
2427
}
2442
2428
}
2443
2429
2430
+ static void (* find_four_matchers )(const uint8_t * sequence , size_t sequence_length ,
2431
+ const bitmask_t * restrict init_masks ,
2432
+ const bitmask_t * restrict found_masks ,
2433
+ const bitmask_t (* by_four_bitmasks )[4 ],
2434
+ AdapterSequence * * adapter_sequences_store ,
2435
+ uint64_t * * adapter_counter ) = NULL ;
2436
+
2437
+ #if COMPILER_HAS_TARGETED_DISPATCH && BUILD_IS_X86_64
2438
+ __attribute__((__target__ ("avx2" ))) static void
2439
+ find_four_matchers_avx2 (const uint8_t * sequence , size_t sequence_length ,
2440
+ const bitmask_t * restrict init_masks ,
2441
+ const bitmask_t * restrict found_masks ,
2442
+ const bitmask_t (* by_four_bitmasks )[4 ],
2443
+ AdapterSequence * * adapter_sequences_store ,
2444
+ uint64_t * * adapter_counter )
2445
+ {
2446
+ bitmask_t fmask0 = found_masks [0 ];
2447
+ bitmask_t fmask1 = found_masks [1 ];
2448
+ bitmask_t fmask2 = found_masks [2 ];
2449
+ bitmask_t fmask3 = found_masks [3 ];
2450
+ bitmask_t already_found0 = 0 ;
2451
+ bitmask_t already_found1 = 0 ;
2452
+ bitmask_t already_found2 = 0 ;
2453
+ bitmask_t already_found3 = 0 ;
2454
+
2455
+ __m256i found_mask = _mm256_loadu_si256 ((const __m256i * )(found_masks ));
2456
+ __m256i init_mask = _mm256_loadu_si256 ((const __m256i * )(init_masks ));
2457
+
2458
+ __m256i R = _mm256_setzero_si256 ();
2459
+ const bitmask_t (* bitmask )[4 ] = by_four_bitmasks ;
2460
+
2461
+ for (size_t pos = 0 ; pos < sequence_length ; pos ++ ) {
2462
+ R = _mm256_slli_epi64 (R , 1 );
2463
+ R = _mm256_or_si256 (R , init_mask );
2464
+ uint8_t index = NUCLEOTIDE_TO_INDEX [sequence [pos ]];
2465
+ R = _mm256_and_si256 (R , _mm256_loadu_si256 ((__m256i * )bitmask [index ]));
2466
+
2467
+ __m256i check = _mm256_and_si256 (R , found_mask );
2468
+ /* Adding 0b01111111 (127) to any number higher than 0 sets the bit for
2469
+ 128. Movemask collects these bits. This way we can test if there is
2470
+ a 1 across the entire 256-bit vector. */
2471
+ int check_int =
2472
+ _mm256_movemask_epi8 (_mm256_adds_epu8 (check , _mm256_set1_epi8 (127 )));
2473
+ if (check_int ) {
2474
+ bitmask_t Rray [4 ];
2475
+ _mm256_storeu_si256 (((__m256i * )Rray ), R );
2476
+
2477
+ if (Rray [0 ] & fmask0 ) {
2478
+ already_found0 = update_adapter_count_array (
2479
+ pos , Rray [0 ], already_found0 , adapter_sequences_store [0 ],
2480
+ adapter_counter );
2481
+ }
2482
+ if (Rray [1 ] & fmask1 ) {
2483
+ already_found1 = update_adapter_count_array (
2484
+ pos , Rray [1 ], already_found1 , adapter_sequences_store [1 ],
2485
+ adapter_counter );
2486
+ }
2487
+ if (Rray [2 ] & fmask2 ) {
2488
+ already_found2 = update_adapter_count_array (
2489
+ pos , Rray [2 ], already_found2 , adapter_sequences_store [2 ],
2490
+ adapter_counter );
2491
+ }
2492
+ if (Rray [3 ] & fmask3 ) {
2493
+ already_found3 = update_adapter_count_array (
2494
+ pos , Rray [3 ], already_found3 , adapter_sequences_store [3 ],
2495
+ adapter_counter );
2496
+ }
2497
+ }
2498
+ }
2499
+ }
2500
+
2501
+ /* Constructor runs at dynamic load time */
2502
+ __attribute__((constructor )) static void
2503
+ find_four_matchers_init_func_ptr (void )
2504
+ {
2505
+ if (__builtin_cpu_supports ("avx2" )) {
2506
+ find_four_matchers = find_four_matchers_avx2 ;
2507
+ }
2508
+ else {
2509
+ find_four_matchers = NULL ;
2510
+ }
2511
+ }
2512
+ #endif
2513
+
2444
2514
static int
2445
2515
AdapterCounter_add_meta (AdapterCounter * self , struct FastqMeta * meta )
2446
2516
{
@@ -2462,6 +2532,16 @@ AdapterCounter_add_meta(AdapterCounter *self, struct FastqMeta *meta)
2462
2532
AdapterSequence * * adapter_sequences = self -> adapter_sequences ;
2463
2533
uint64_t * * adapter_count_array = self -> adapter_counter ;
2464
2534
while (matcher_index < number_of_matchers ) {
2535
+ /* Only run when a vectorized function pointer is initialized */
2536
+ if (find_four_matchers && number_of_matchers - matcher_index > 1 ) {
2537
+ find_four_matchers (
2538
+ sequence , sequence_length , init_masks + matcher_index ,
2539
+ found_masks + matcher_index ,
2540
+ self -> by_four_bitmasks [matcher_index / 4 ],
2541
+ adapter_sequences + matcher_index , adapter_count_array );
2542
+ matcher_index += 4 ;
2543
+ continue ;
2544
+ }
2465
2545
find_single_matcher (
2466
2546
sequence , sequence_length , init_masks + matcher_index ,
2467
2547
found_masks + matcher_index , bitmasks + matcher_index ,
0 commit comments