#endif // #ifndef SL2_ELEMENTS

#ifndef SL2_PI
- #define SL2_PI 3.1415926535897932384626433832795
+ #define SL2_PI 3.14159265358979323846264338327950288419716939937510 // All good things in excess.
#endif // #ifndef SL2_PI
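A note on the widened constant (a quick sketch, not part of the patch): an IEEE-754 double carries only about 17 significant decimal digits, so the old and new literals round to exactly the same value at compile time; the extra digits would only matter if SL2_PI were ever used to seed a wider type such as long double. The little program below is hypothetical, purely to illustrate the rounding.

    #include <cstdio>

    int main() {
        constexpr double dOld = 3.1415926535897932384626433832795;
        constexpr double dNew = 3.14159265358979323846264338327950288419716939937510;
        static_assert( dOld == dNew, "Both literals round to the same double." );
        std::printf( "%.17g\n", dNew ); // Prints 3.1415926535897931.
        return 0;
    }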

#ifndef SL2_ROUND_UP
@@ -1344,20 +1344,21 @@ namespace sl2 {
* \return Returns the sum of all the floats in the given register.
**/
static inline double HorizontalSum( __m512d _mReg ) {
+ return _mm512_reduce_add_pd( _mReg );
// Step 1: Reduce 512 bits to 256 bits by summing the high and low 256 bits.
- __m256d mLow256 = _mm512_castpd512_pd256( _mReg ); // Low 256 bits.
- __m256d mHigh256 = _mm512_extractf64x4_pd( _mReg, 1 ); // High 256 bits.
- __m256d mSum256 = _mm256_add_pd( mLow256, mHigh256 ); // Add high and low 256-bit parts.
+ // __m256d mLow256 = _mm512_castpd512_pd256( _mReg ); // Low 256 bits.
+ // __m256d mHigh256 = _mm512_extractf64x4_pd( _mReg, 1 ); // High 256 bits.
+ // __m256d mSum256 = _mm256_add_pd( mLow256, mHigh256 ); // Add high and low 256-bit parts.

- // Step 2: Follow the same 256-bit reduction routine.
- __m256d mShuf = _mm256_permute2f128_pd( mSum256, mSum256, 0x1 ); // Swap low and high 128-bit halves.
- __m256d mSums = _mm256_add_pd( mSum256, mShuf ); // Add low and high halves.
+ // // Step 2: Follow the same 256-bit reduction routine.
+ // __m256d mShuf = _mm256_permute2f128_pd( mSum256, mSum256, 0x1 ); // Swap low and high 128-bit halves.
+ // __m256d mSums = _mm256_add_pd( mSum256, mShuf ); // Add low and high halves.

- mShuf = _mm256_shuffle_pd( mSums, mSums, 0x5 ); // Swap the pairs of doubles.
- mSums = _mm256_add_pd( mSums, mShuf ); // Add the pairs.
+ // mShuf = _mm256_shuffle_pd( mSums, mSums, 0x5 ); // Swap the pairs of doubles.
+ // mSums = _mm256_add_pd( mSums, mShuf ); // Add the pairs.

- // Step 3: Extract the scalar value (final sum).
- return _mm256_cvtsd_f64( mSums ); // Extract the lower double as the sum.
+ // // Step 3: Extract the scalar value (final sum).
+ // return _mm256_cvtsd_f64( mSums );
}
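A quick sanity check for the new AVX-512 double path (illustrative only; requires an AVX-512F-capable compiler and CPU, and the test values are hypothetical): _mm512_reduce_add_pd() is a helper that compilers expand to a short shuffle/add sequence rather than a single instruction, so for exactly representable inputs it should match a plain scalar accumulation.

    #include <immintrin.h>
    #include <cassert>

    int main() {
        alignas( 64 ) const double dVals[8] = { 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0 };
        __m512d mReg = _mm512_load_pd( dVals );
        double dScalar = 0.0;
        for ( int I = 0; I < 8; ++I ) { dScalar += dVals[I]; }
        assert( _mm512_reduce_add_pd( mReg ) == dScalar ); // 36.0 exactly; these values incur no rounding.
        return 0;
    }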

/**
@@ -1367,17 +1368,18 @@ namespace sl2 {
* \return Returns the sum of all the floats in the given register.
**/
static inline float HorizontalSum( __m512 _mReg ) {
+ return _mm512_reduce_add_ps( _mReg );
// Step 1: Reduce 512 bits to 256 bits by permuting and adding high and low 256 bits.
- __m256 mLow256 = _mm512_castps512_ps256( _mReg ); // Low 256 bits.
- __m256 mHigh256 = _mm512_extractf32x8_ps( _mReg, 1 ); // High 256 bits.
- __m256 mSum256 = _mm256_add_ps( mLow256, mHigh256 ); // Add high and low 256-bit parts.
+ // __m256 mLow256 = _mm512_castps512_ps256( _mReg ); // Low 256 bits.
+ // __m256 mHigh256 = _mm512_extractf32x8_ps( _mReg, 1 ); // High 256 bits.
+ // __m256 mSum256 = _mm256_add_ps( mLow256, mHigh256 ); // Add high and low 256-bit parts.

- // Step 2: Perform horizontal addition on 256 bits.
- mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // First horizontal add.
- mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // Second horizontal add.
+ // // Step 2: Perform horizontal addition on 256 bits.
+ // mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // First horizontal add.
+ // mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // Second horizontal add.

- // Step 3: Extract the lower float which now contains the sum.
- return _mm256_cvtss_f32( mSum256 );
+ // // Step 3: Extract the lower float which now contains the sum.
+ // return _mm256_cvtss_f32( mSum256 );
}
#endif // #ifdef __AVX512F__
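One behavioural note on the float overload (hedged: the exact order of additions chosen for _mm512_reduce_add_ps() is left to the compiler): the intrinsic and the old manual code may associate the 16 additions differently, so the result can differ from a left-to-right scalar sum in the last few bits; that is ordinary floating-point reassociation, not a defect. A hypothetical scalar model of a pairwise (tree) reduction, for comparison:

    // Not library code; models a tree-style reduction over 16 floats.
    static float TreeSum16( const float * _pfVals ) {
        float fTmp[16];
        for ( int I = 0; I < 16; ++I ) { fTmp[I] = _pfVals[I]; }
        for ( int iHalf = 8; iHalf >= 1; iHalf >>= 1 ) { // 16 -> 8 -> 4 -> 2 -> 1.
            for ( int I = 0; I < iHalf; ++I ) { fTmp[I] += fTmp[I+iHalf]; }
        }
        return fTmp[0]; // May differ from a left-to-right sum by a few ULPs.
    }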

@@ -1389,13 +1391,19 @@ namespace sl2 {
* \return Returns the sum of all the floats in the given register.
**/
static inline double HorizontalSum( __m256d &_mReg ) {
+ __m256d mT1 = _mm256_hadd_pd( _mReg, _mReg ); // { r0+r1, r0+r1, r2+r3, r2+r3 }.
+ __m128d mT2 = _mm256_extractf128_pd( mT1, 1 ); // High 128 bits: { r2+r3, r2+r3 }.
+ __m128d mT3 = _mm256_castpd256_pd128( mT1 ); // Low 128 bits: { r0+r1, r0+r1 }.
+ return _mm_cvtsd_f64( _mm_add_pd( mT2, mT3 ) ); // Lower double is r0+r1+r2+r3.
+ #if 0
__m256d mShuf = _mm256_permute2f128_pd( _mReg, _mReg, 0x1 ); // Swap the low and high halves.
__m256d mSums = _mm256_add_pd( _mReg, mShuf ); // Add the low and high halves.

mShuf = _mm256_shuffle_pd( mSums, mSums, 0x5 ); // Swap the pairs of doubles.
mSums = _mm256_add_pd( mSums, mShuf ); // Add the pairs.

return _mm256_cvtsd_f64( mSums ); // Extract the sum.
+ #endif // #if 0
}
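For reference (a scalar model, not library code): _mm256_hadd_pd() adds pairs within each 128-bit lane, which is why the new body still has to extract the high lane and add it to the low lane to get the full sum.

    // Hypothetical scalar model of _mm256_hadd_pd( r, r ) followed by the lane merge.
    static double HorizontalSumModel4( const double _dReg[4] ) {
        double dLowLane = _dReg[0] + _dReg[1]; // hadd result, elements 0 and 1.
        double dHighLane = _dReg[2] + _dReg[3]; // hadd result, elements 2 and 3.
        return dLowLane + dHighLane; // _mm_add_pd( mT2, mT3 ), lower double.
    }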

/**
@@ -1405,6 +1413,13 @@ namespace sl2 {
* \return Returns the sum of all the floats in the given register.
**/
static inline float HorizontalSum( const __m256 &_mReg ) {
+ SL2_ALIGN( 32 )
+ float fSumArray[8];
+ __m256 mTmp = _mm256_hadd_ps( _mReg, _mReg ); // Per-lane pair sums.
+ mTmp = _mm256_hadd_ps( mTmp, mTmp ); // [0] = r0+r1+r2+r3, [4] = r4+r5+r6+r7.
+ _mm256_store_ps( fSumArray, mTmp ); // Spill to the aligned array.
+ return fSumArray[0] + fSumArray[4]; // Combine the low- and high-lane sums.
+ #if 0
__m256 mTmp = _mm256_permute2f128_ps(_mReg, _mReg, 1); // Shuffle high 128 to low.
mTmp = _mm256_add_ps( _mReg, mTmp ); // Add high and low parts.

@@ -1413,6 +1428,7 @@ namespace sl2 {

// Extract the lower float which now contains the sum.
return _mm256_cvtss_f32( mTmp );
+ #endif // #if 0
}
#endif // #ifdef __AVX__
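The same lane rule explains the AVX float overload (assuming SL2_ALIGN( 32 ) yields the 32-byte alignment that _mm256_store_ps() requires): each _mm256_hadd_ps() pass stays inside its 128-bit lane, so after two passes element 0 holds the sum of the low four floats and element 4 the sum of the high four, and the final scalar add merges the two lanes.

    // Hypothetical scalar model of the two in-lane hadd passes over 8 floats.
    static float HorizontalSumModel8( const float _fReg[8] ) {
        float fLowLane = ( _fReg[0] + _fReg[1] ) + ( _fReg[2] + _fReg[3] ); // Ends up in fSumArray[0].
        float fHighLane = ( _fReg[4] + _fReg[5] ) + ( _fReg[6] + _fReg[7] ); // Ends up in fSumArray[4].
        return fLowLane + fHighLane; // fSumArray[0] + fSumArray[4].
    }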