Skip to content

Commit a75aad5

Browse files
committed
Optimized? HorizontalSum().
1 parent 515c66d commit a75aad5

File tree

2 files changed

+39
-23
lines changed

2 files changed

+39
-23
lines changed

Src/Utilities/SL2Utilities.h

Lines changed: 35 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
#endif // #ifndef SL2_ELEMENTS
2525

2626
#ifndef SL2_PI
27-
#define SL2_PI 3.1415926535897932384626433832795
27+
#define SL2_PI 3.14159265358979323846264338327950288419716939937510 // All good things in excess.
2828
#endif // #ifndef SL2_PI
2929

3030
#ifndef SL2_ROUND_UP
@@ -1344,20 +1344,21 @@ namespace sl2 {
13441344
* \return Returns the sum of all the doubles in the given register.
13451345
**/
13461346
static inline double HorizontalSum( __m512d _mReg ) {
1347+
return _mm512_reduce_add_pd( _mReg );
13471348
// Step 1: Reduce 512 bits to 256 bits by summing the high and low 256 bits.
1348-
__m256d mLow256 = _mm512_castpd512_pd256( _mReg ); // Low 256 bits.
1349-
__m256d mHigh256 = _mm512_extractf64x4_pd( _mReg, 1 ); // High 256 bits.
1350-
__m256d mSum256 = _mm256_add_pd( mLow256, mHigh256 ); // Add high and low 256-bit parts.
1349+
//__m256d mLow256 = _mm512_castpd512_pd256( _mReg ); // Low 256 bits.
1350+
//__m256d mHigh256 = _mm512_extractf64x4_pd( _mReg, 1 ); // High 256 bits.
1351+
//__m256d mSum256 = _mm256_add_pd( mLow256, mHigh256 ); // Add high and low 256-bit parts.
13511352

1352-
// Step 2: Follow the same 256-bit reduction routine.
1353-
__m256d mShuf = _mm256_permute2f128_pd( mSum256, mSum256, 0x1 ); // Swap low and high 128-bit halves.
1354-
__m256d mSums = _mm256_add_pd( mSum256, mShuf ); // Add low and high halves.
1353+
//// Step 2: Follow the same 256-bit reduction routine.
1354+
//__m256d mShuf = _mm256_permute2f128_pd( mSum256, mSum256, 0x1 ); // Swap low and high 128-bit halves.
1355+
//__m256d mSums = _mm256_add_pd( mSum256, mShuf ); // Add low and high halves.
13551356

1356-
mShuf = _mm256_shuffle_pd( mSums, mSums, 0x5 ); // Swap the pairs of doubles.
1357-
mSums = _mm256_add_pd( mSums, mShuf ); // Add the pairs.
1357+
//mShuf = _mm256_shuffle_pd( mSums, mSums, 0x5 ); // Swap the pairs of doubles.
1358+
//mSums = _mm256_add_pd( mSums, mShuf ); // Add the pairs.
13581359

1359-
// Step 3: Extract the scalar value (final sum).
1360-
return _mm256_cvtsd_f64( mSums ); // Extract the lower double as the sum.
1360+
//// Step 3: Extract the scalar value (final sum).
1361+
//return _mm256_cvtsd_f64( mSums ); // Extract the lower double as the sum.
13611362
}
13621363

13631364
/**
@@ -1367,17 +1368,18 @@ namespace sl2 {
13671368
* \return Returns the sum of all the floats in the given register.
13681369
**/
13691370
static inline float HorizontalSum( __m512 _mReg ) {
1371+
return _mm512_reduce_add_ps( _mReg );
13701372
// Step 1: Reduce 512 bits to 256 bits by permuting and adding high and low 256 bits.
1371-
__m256 mLow256 = _mm512_castps512_ps256( _mReg ); // Low 256 bits.
1372-
__m256 mHigh256 = _mm512_extractf32x8_ps( _mReg, 1 ); // High 256 bits.
1373-
__m256 mSum256 = _mm256_add_ps( mLow256, mHigh256 ); // Add high and low 256-bit parts.
1373+
//__m256 mLow256 = _mm512_castps512_ps256( _mReg ); // Low 256 bits.
1374+
//__m256 mHigh256 = _mm512_extractf32x8_ps( _mReg, 1 ); // High 256 bits.
1375+
//__m256 mSum256 = _mm256_add_ps( mLow256, mHigh256 ); // Add high and low 256-bit parts.
13741376

1375-
// Step 2: Perform horizontal addition on 256 bits.
1376-
mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // First horizontal add.
1377-
mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // Second horizontal add.
1377+
//// Step 2: Perform horizontal addition on 256 bits.
1378+
//mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // First horizontal add.
1379+
//mSum256 = _mm256_hadd_ps( mSum256, mSum256 ); // Second horizontal add.
13781380

1379-
// Step 3: Extract the lower float which now contains the sum.
1380-
return _mm256_cvtss_f32( mSum256 );
1381+
//// Step 3: Extract the lower float which now contains the sum.
1382+
//return _mm256_cvtss_f32( mSum256 );
13811383
}
13821384
#endif // #ifdef __AVX512F__
13831385

@@ -1389,13 +1391,19 @@ namespace sl2 {
13891391
* \return Returns the sum of all the doubles in the given register.
13901392
**/
13911393
static inline double HorizontalSum( __m256d &_mReg ) {
1394+
__m256d mT1 = _mm256_hadd_pd( _mReg, _mReg );
1395+
__m128d mT2 = _mm256_extractf128_pd( mT1, 1 );
1396+
__m128d mT3 = _mm256_castpd256_pd128( mT1 );
1397+
return _mm_cvtsd_f64( _mm_add_pd( mT2, mT3 ) );
1398+
#if 0
13921399
__m256d mShuf = _mm256_permute2f128_pd( _mReg, _mReg, 0x1 ); // Swap the low and high halves.
13931400
__m256d mSums = _mm256_add_pd( _mReg, mShuf ); // Add the low and high halves.
13941401

13951402
mShuf = _mm256_shuffle_pd( mSums, mSums, 0x5 ); // Swap the pairs of doubles.
13961403
mSums = _mm256_add_pd( mSums, mShuf ); // Add the pairs.
13971404

13981405
return _mm256_cvtsd_f64( mSums ); // Extract the sum.
1406+
#endif // #if 0
13991407
}
14001408

14011409
/**
@@ -1405,6 +1413,13 @@ namespace sl2 {
14051413
* \return Returns the sum of all the floats in the given register.
14061414
**/
14071415
static inline float HorizontalSum( const __m256 &_mReg ) {
1416+
SL2_ALIGN( 32 )
1417+
float fSumArray[8];
1418+
__m256 mTmp = _mm256_hadd_ps( _mReg, _mReg );
1419+
mTmp = _mm256_hadd_ps( mTmp, mTmp );
1420+
_mm256_store_ps( fSumArray, mTmp );
1421+
return fSumArray[0] + fSumArray[4];
1422+
#if 0
14081423
__m256 mTmp = _mm256_permute2f128_ps(_mReg, _mReg, 1); // Shuffle high 128 to low.
14091424
mTmp = _mm256_add_ps( _mReg, mTmp ); // Add high and low parts.
14101425

@@ -1413,6 +1428,7 @@ namespace sl2 {
14131428

14141429
// Extract the lower float which now contains the sum.
14151430
return _mm256_cvtss_f32( mTmp );
1431+
#endif // #if 0
14161432
}
14171433
#endif // #ifdef __AVX__
14181434

SurfaceLevel2.vcxproj

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,7 @@
607607
<ClCompile>
608608
<WarningLevel>Level3</WarningLevel>
609609
<SDLCheck>true</SDLCheck>
610-
<PreprocessorDefinitions>CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;BASISU_NO_ITERATOR_DEBUG_LEVEL;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
610+
<PreprocessorDefinitions>CMS_NO_REGISTER_KEYWORD;CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;BASISU_NO_ITERATOR_DEBUG_LEVEL;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
611611
<ConformanceMode>true</ConformanceMode>
612612
<LanguageStandard>stdcpp20</LanguageStandard>
613613
<AdditionalIncludeDirectories>$(ProjectDir)Src\Image\Squish;$(ProjectDir)Src\Image\FreeImage\Source;$(ProjectDir)Src\Image\KTX-Software\include;$(ProjectDir)Src\Image\KTX-Software\lib\dfdutils;$(ProjectDir)Src\Image\KTX-Software\utils;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\zstd;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\transcoder</AdditionalIncludeDirectories>
@@ -635,7 +635,7 @@
635635
<ClCompile>
636636
<WarningLevel>Level3</WarningLevel>
637637
<SDLCheck>true</SDLCheck>
638-
<PreprocessorDefinitions>SL2_LIB;CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;BASISU_NO_ITERATOR_DEBUG_LEVEL;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
638+
<PreprocessorDefinitions>CMS_NO_REGISTER_KEYWORD;SL2_LIB;CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;BASISU_NO_ITERATOR_DEBUG_LEVEL;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;_DEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
639639
<ConformanceMode>true</ConformanceMode>
640640
<LanguageStandard>stdcpp20</LanguageStandard>
641641
<AdditionalIncludeDirectories>$(ProjectDir)Src\Image\Squish;$(ProjectDir)Src\Image\FreeImage\Source;$(ProjectDir)Src\Image\KTX-Software\include;$(ProjectDir)Src\Image\KTX-Software\lib\dfdutils;$(ProjectDir)Src\Image\KTX-Software\utils;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\zstd;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\transcoder</AdditionalIncludeDirectories>
@@ -665,7 +665,7 @@
665665
<FunctionLevelLinking>true</FunctionLevelLinking>
666666
<IntrinsicFunctions>true</IntrinsicFunctions>
667667
<SDLCheck>true</SDLCheck>
668-
<PreprocessorDefinitions>CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
668+
<PreprocessorDefinitions>CMS_NO_REGISTER_KEYWORD;CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
669669
<ConformanceMode>true</ConformanceMode>
670670
<LanguageStandard>stdcpp20</LanguageStandard>
671671
<AdditionalIncludeDirectories>$(ProjectDir)Src\Image\Squish;$(ProjectDir)Src\Image\FreeImage\Source;$(ProjectDir)Src\Image\KTX-Software\include;$(ProjectDir)Src\Image\KTX-Software\lib\dfdutils;$(ProjectDir)Src\Image\KTX-Software\utils;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\zstd;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\transcoder</AdditionalIncludeDirectories>
@@ -696,7 +696,7 @@
696696
<FunctionLevelLinking>true</FunctionLevelLinking>
697697
<IntrinsicFunctions>true</IntrinsicFunctions>
698698
<SDLCheck>true</SDLCheck>
699-
<PreprocessorDefinitions>SL2_LIB;CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
699+
<PreprocessorDefinitions>CMS_NO_REGISTER_KEYWORD;SL2_LIB;CMS_NO_REGISTER_KEYWORD;__AVX512BW__=1;__AVX512F__=1;__AVX2__=1;__AVX__=1;__SSE4_1__=1;KTX_FEATURE_WRITE=1;KHRONOS_STATIC;_LIB;OPJ_STATIC;LIBRAW_NODLL;FREEIMAGE_LIB;NDEBUG;_CONSOLE;%(PreprocessorDefinitions);STBI_NO_STDIO</PreprocessorDefinitions>
700700
<ConformanceMode>true</ConformanceMode>
701701
<LanguageStandard>stdcpp20</LanguageStandard>
702702
<AdditionalIncludeDirectories>$(ProjectDir)Src\Image\Squish;$(ProjectDir)Src\Image\FreeImage\Source;$(ProjectDir)Src\Image\KTX-Software\include;$(ProjectDir)Src\Image\KTX-Software\lib\dfdutils;$(ProjectDir)Src\Image\KTX-Software\utils;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\zstd;$(ProjectDir)Src\Image\KTX-Software\lib\basisu\transcoder</AdditionalIncludeDirectories>

0 commit comments

Comments
 (0)