From 690e34ea451bf593a7107f26617dfa26a226f448 Mon Sep 17 00:00:00 2001 From: Yermalayeu Ihar Date: Mon, 14 Oct 2024 17:39:09 +0300 Subject: [PATCH] *refactoring of SSE4.1 optimizations of SynetMergedConvolution32f. --- prj/vs2019/Sse41.vcxproj | 7 +- prj/vs2019/Sse41.vcxproj.filters | 21 +- prj/vs2022/Sse41.vcxproj | 7 +- prj/vs2022/Sse41.vcxproj.filters | 21 +- .../SimdBaseSynetMergedConvolution32f.cpp | 2 + .../SimdSse41SynetMergedConvolution32f.cpp | 81 + .../SimdSse41SynetMergedConvolution32fCd.cpp | 497 ------ .../SimdSse41SynetMergedConvolution32fCdc.cpp | 1493 ----------------- .../SimdSse41SynetMergedConvolution32fDc.cpp | 456 ----- ...se41SynetMergedConvolution32fDepthwise.cpp | 433 +++++ ...imdSse41SynetMergedConvolution32fInput.cpp | 609 +++++++ ...mdSse41SynetMergedConvolution32fOutput.cpp | 570 +++++++ src/Simd/SimdSynetMergedConvolution32f.h | 14 +- src/Test/TestSynetMergedConvolution32f.cpp | 9 +- 14 files changed, 1737 insertions(+), 2483 deletions(-) create mode 100644 src/Simd/SimdSse41SynetMergedConvolution32f.cpp delete mode 100644 src/Simd/SimdSse41SynetMergedConvolution32fCd.cpp delete mode 100644 src/Simd/SimdSse41SynetMergedConvolution32fCdc.cpp delete mode 100644 src/Simd/SimdSse41SynetMergedConvolution32fDc.cpp create mode 100644 src/Simd/SimdSse41SynetMergedConvolution32fDepthwise.cpp create mode 100644 src/Simd/SimdSse41SynetMergedConvolution32fInput.cpp create mode 100644 src/Simd/SimdSse41SynetMergedConvolution32fOutput.cpp diff --git a/prj/vs2019/Sse41.vcxproj b/prj/vs2019/Sse41.vcxproj index e3c868fb32..35ad7eca6a 100644 --- a/prj/vs2019/Sse41.vcxproj +++ b/prj/vs2019/Sse41.vcxproj @@ -123,9 +123,10 @@ - - - + + + + diff --git a/prj/vs2019/Sse41.vcxproj.filters b/prj/vs2019/Sse41.vcxproj.filters index 3ba8ba02cf..12ea5d8a95 100644 --- a/prj/vs2019/Sse41.vcxproj.filters +++ b/prj/vs2019/Sse41.vcxproj.filters @@ -265,15 +265,6 @@ Sse41 - - Sse41 - - - Sse41 - - - Sse41 - Sse41 @@ -418,6 +409,18 @@ Sse41 + + Sse41 + + + Sse41 + + + Sse41 + + + Sse41 + diff --git a/prj/vs2022/Sse41.vcxproj b/prj/vs2022/Sse41.vcxproj index e3c868fb32..35ad7eca6a 100644 --- a/prj/vs2022/Sse41.vcxproj +++ b/prj/vs2022/Sse41.vcxproj @@ -123,9 +123,10 @@ - - - + + + + diff --git a/prj/vs2022/Sse41.vcxproj.filters b/prj/vs2022/Sse41.vcxproj.filters index 3ba8ba02cf..12ea5d8a95 100644 --- a/prj/vs2022/Sse41.vcxproj.filters +++ b/prj/vs2022/Sse41.vcxproj.filters @@ -265,15 +265,6 @@ Sse41 - - Sse41 - - - Sse41 - - - Sse41 - Sse41 @@ -418,6 +409,18 @@ Sse41 + + Sse41 + + + Sse41 + + + Sse41 + + + Sse41 + diff --git a/src/Simd/SimdBaseSynetMergedConvolution32f.cpp b/src/Simd/SimdBaseSynetMergedConvolution32f.cpp index 873d410912..3cba907193 100644 --- a/src/Simd/SimdBaseSynetMergedConvolution32f.cpp +++ b/src/Simd/SimdBaseSynetMergedConvolution32f.cpp @@ -438,6 +438,7 @@ namespace Simd break; } _sizeB[1] = 0; + _bufH[1] = 0; for (size_t i = 0; i < 2; ++i) { size_t dstC = AlignHiAny(p.conv[i].dstC, i == 1 ? _miC : 2 * _miC); @@ -551,6 +552,7 @@ namespace Simd break; } _bufH[1] = _bufH[0]; + _bufH[0] = 0; _sizeB[1] = 0; for (size_t i = 0; i < 2; ++i) { diff --git a/src/Simd/SimdSse41SynetMergedConvolution32f.cpp b/src/Simd/SimdSse41SynetMergedConvolution32f.cpp new file mode 100644 index 0000000000..cc93f0e6a1 --- /dev/null +++ b/src/Simd/SimdSse41SynetMergedConvolution32f.cpp @@ -0,0 +1,81 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2024 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdSynetMergedConvolution32f.h" +#include "Simd/SimdSynetConvolution32fCommon.h" +#include "Simd/SimdUpdate.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ +#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) + namespace Sse41 + { + SynetMergedConvolution32fCdc::SynetMergedConvolution32fCdc(const MergConvParam& p) + : Base::SynetMergedConvolution32fCdc(p) + { + SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F); + SetInput(p.conv[0], _convolution + 0); + SetDepthwise(p.conv[1], false, _convolution + 1); + SetOutput(p.conv[2], _convolution + 2); + } + + //------------------------------------------------------------------------------------------------- + + SynetMergedConvolution32fCd::SynetMergedConvolution32fCd(const MergConvParam& p) + : Base::SynetMergedConvolution32fCd(p) + { + SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F); + SetInput(_param.conv[0], _convolution + 0); + SetDepthwise(_param.conv[1], true, _convolution + 1); + } + + //------------------------------------------------------------------------------------------------- + + SynetMergedConvolution32fDc::SynetMergedConvolution32fDc(const MergConvParam& p) + : Base::SynetMergedConvolution32fDc(p) + { + SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F); + SetDepthwise(p.conv[0], false, _convolution + 0); + SetOutput(p.conv[1], _convolution + 1); + } + + //------------------------------------------------------------------------------------------------- + + void* SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters* convs, size_t count, SimdBool add) + { + MergConvParam param(batch, convs, count, add, SimdSynetCompatibilityDefault); + if (!param.Valid(SimdTensorData32f)) + return NULL; + if (SynetMergedConvolution32fCdc::Preferable(param)) + return new Sse41::SynetMergedConvolution32fCdc(param); + else if (SynetMergedConvolution32fCd::Preferable(param)) + return new Sse41::SynetMergedConvolution32fCd(param); + else if (SynetMergedConvolution32fDc::Preferable(param)) + return new Sse41::SynetMergedConvolution32fDc(param); + else + return new Base::SynetMergedConvolution32f(param); + } + } +#endif +} diff --git a/src/Simd/SimdSse41SynetMergedConvolution32fCd.cpp b/src/Simd/SimdSse41SynetMergedConvolution32fCd.cpp deleted file mode 100644 index aa66d3bd0e..0000000000 --- a/src/Simd/SimdSse41SynetMergedConvolution32fCd.cpp +++ /dev/null @@ -1,497 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2024 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) - namespace Sse41 - { - namespace Cd - { - SIMD_INLINE void Save(float* ptr, __m128 val, size_t tail) - { - float tmp[F]; - _mm_storeu_ps(tmp, val); - for (size_t i = 0; i < tail; ++i) - ptr[i] = tmp[i]; - } - - template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, weightS = p.kernelY * p.kernelX * F, strideXF = strideX * F; - size_t srcM = (bufH[0] - 1), srcS = bufH[0] * srcW, dstS = p.dstW*p.dstC; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; - size_t srcCF = AlignLo(srcC, F); - - __m128 _params[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + c); - - if (c == srcCF) - { - size_t tail = srcC - srcCF; - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + dy * dstS; - for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - } - Save(pd, Activate(sum, _params, 0), tail); - } - } - } - else - { - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + dy * dstS; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += p.dstC) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * p.dstC) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - __m128 sum2 = _bias; - __m128 sum3 = _bias; - __m128 sum4 = _bias; - __m128 sum5 = _bias; - __m128 sum6 = _bias; - __m128 sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXF), w0), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * strideXF), w0), sum2); - sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * strideXF), w0), sum3); - sum4 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * strideXF), w0), sum4); - sum5 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * strideXF), w0), sum5); - sum6 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * strideXF), w0), sum6); - sum7 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * strideXF), w0), sum7); - } - } - _mm_storeu_ps(pd + 0 * p.dstC, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * p.dstC, Activate(sum1, _params, 0)); - _mm_storeu_ps(pd + 2 * p.dstC, Activate(sum2, _params, 0)); - _mm_storeu_ps(pd + 3 * p.dstC, Activate(sum3, _params, 0)); - _mm_storeu_ps(pd + 4 * p.dstC, Activate(sum4, _params, 0)); - _mm_storeu_ps(pd + 5 * p.dstC, Activate(sum5, _params, 0)); - _mm_storeu_ps(pd + 6 * p.dstC, Activate(sum6, _params, 0)); - _mm_storeu_ps(pd + 7 * p.dstC, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * p.dstC) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - __m128 sum2 = _bias; - __m128 sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXF), w0), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * strideXF), w0), sum2); - sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * strideXF), w0), sum3); - } - } - _mm_storeu_ps(pd + 0 * p.dstC, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * p.dstC, Activate(sum1, _params, 0)); - _mm_storeu_ps(pd + 2 * p.dstC, Activate(sum2, _params, 0)); - _mm_storeu_ps(pd + 3 * p.dstC, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * p.dstC) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXF), w0), sum1); - } - } - _mm_storeu_ps(pd + 0 * p.dstC, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * p.dstC, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += p.dstC) - { - __m128 sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), w0), sum); - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += p.dstC) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += p.dstC) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - } - } - - src += srcS; - dst += F; - weight += weightS; - } - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float* src0, const float* src1, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float* src0, const float* src1, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * F), weight[2]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * F), weight[5]), sum2); - _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float* src0, const float* src1, const float* src2, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * F), weight[6]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * F), weight[7]), sum1); - _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float* src0, const float* src1, const float* src2, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * F), weight[2]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * F), weight[5]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * F), weight[6]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * F), weight[7]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 2 * F), weight[8]), sum2); - _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x2( - const float* src0, const float* src1, const float* src2, const __m128* weight, const __m128& bias, const __m128* params, float* dst, size_t dstC) - { - __m128 sum0 = bias, sum1 = bias, s0; - - s0 = _mm_loadu_ps(src0 + 0 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum0); - s0 = _mm_loadu_ps(src0 + 1 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[1]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum1); - s0 = _mm_loadu_ps(src0 + 2 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[2]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[1]), sum1); - s0 = _mm_loadu_ps(src0 + 3 * F); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[2]), sum1); - - s0 = _mm_loadu_ps(src1 + 0 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[3]), sum0); - s0 = _mm_loadu_ps(src1 + 1 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[4]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[3]), sum1); - s0 = _mm_loadu_ps(src1 + 2 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[5]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[4]), sum1); - s0 = _mm_loadu_ps(src1 + 3 * F); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[5]), sum1); - - s0 = _mm_loadu_ps(src2 + 0 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[6]), sum0); - s0 = _mm_loadu_ps(src2 + 1 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[7]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[6]), sum1); - s0 = _mm_loadu_ps(src2 + 2 * F); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[8]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[7]), sum1); - s0 = _mm_loadu_ps(src2 + 3 * F); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[8]), sum1); - - _mm_storeu_ps(dst + 0 * dstC, Activate(sum0, params, 0)); - _mm_storeu_ps(dst + 1 * dstC, Activate(sum1, params, 0)); - } - - template void DepthwiseConvolution3x3(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t srcM = (bufH[0] - 1), srcS = bufH[0] * srcW, dstS = p.dstW * p.dstC; - size_t xStep = F * p.strideX, xStep0 = (p.strideX - p.padX) * F; - size_t xMainEnd = p.dstW - p.padW, xMainEnd2 = AlignLo(xMainEnd - padX, 2)*(p.strideX == 1 ? 1 : 0) + padX; - size_t yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; - - __m128 _params[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m128 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm_loadu_ps(weight + i * F); - __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float* src0 = src + ((sy + 0) & srcM) * srcW; - const float* src1 = src + ((sy + 1) & srcM) * srcW; - float* pDst = dst + dy * dstS; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 4, _bias, _params, pDst), pDst += p.dstC, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += p.dstC, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float* src0 = src + ((sy + 0) & srcM) * srcW; - const float* src1 = src + ((sy + 1) & srcM) * srcW; - const float* src2 = src + ((sy + 2) & srcM) * srcW; - float* pDst = dst + dy * dstS; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 1, _bias, _params, pDst), pDst += p.dstC, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd2; dx += 2, pDst += 2* p.dstC, src0 += 2*xStep, src1 += 2*xStep, src2 += 2*xStep) - ConvolutionDepthwise3x3Main1x2(src0, src1, src2, _weight + 0, _bias, _params, pDst, p.dstC); - for (; dx < xMainEnd; dx++, pDst += p.dstC, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float* src0 = src + ((sy + 0) & srcM) * srcW; - const float* src1 = src + ((sy + 1) & srcM) * srcW; - float* pDst = dst + dy * dstS; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 1, _bias, _params, pDst), pDst += p.dstC, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += p.dstC, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 0, _bias, _params, pDst); - } - src += srcS; - dst += F; - weight += weightS; - } - } - - //--------------------------------------------------------------------- - - template void Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32fCd::ConvolutionPtr * c) - { - switch (t) - { - case 1: - if (p.conv[i].kernelY == 3 && Aligned(p.conv[i].dstC, F)) - c[i] = DepthwiseConvolution3x3; - else - c[i] = DepthwiseConvolution; - break; - default: - assert(0); - } - } - } - - //--------------------------------------------------------------------- - - SynetMergedConvolution32fCd::SynetMergedConvolution32fCd(const MergConvParam& p) - : Base::SynetMergedConvolution32fCd(p) - { - SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F); - SynetMergedConvolution32fCdc::Set(_param, 0, 0, _convolution); - SynetMergedConvolution32fCd::Set(_param, 1, 1, _convolution); - } - - void SynetMergedConvolution32fCd::Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32f::ConvolutionPtr* c) - { - switch (p.conv[i].activation) - { - case SimdConvolutionActivationIdentity: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationRelu: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationLeakyRelu: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationRestrictRange: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationPrelu: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationElu: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationHswish: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationMish: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationHardSigmoid: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationSwish: Cd::Set(p, t, i, c); break; - case SimdConvolutionActivationGelu: Cd::Set(p, t, i, c); break; - default: assert(0); - } - } - } -#endif -} diff --git a/src/Simd/SimdSse41SynetMergedConvolution32fCdc.cpp b/src/Simd/SimdSse41SynetMergedConvolution32fCdc.cpp deleted file mode 100644 index bb43d5be89..0000000000 --- a/src/Simd/SimdSse41SynetMergedConvolution32fCdc.cpp +++ /dev/null @@ -1,1493 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2024 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) - namespace Sse41 - { - namespace Cdc - { - template SIMD_INLINE void InputConvolution1x1_2x6(const float* src0, size_t srcC, - const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) - { - __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - const float* src1 = src0 + 1 * srcC; - const float* src2 = src0 + 2 * srcC; - const float* src3 = src0 + 3 * srcC; - const float* src4 = src0 + 4 * srcC; - const float* src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src0[sc]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - s0 = _mm_set1_ps(src1[sc]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); - s0 = _mm_set1_ps(src2[sc]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); - s0 = _mm_set1_ps(src3[sc]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); - s0 = _mm_set1_ps(src4[sc]); - d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); - s0 = _mm_set1_ps(src5[sc]); - d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); - weight += DF; - } - _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution1x1_2xM(const float* src0, size_t srcC, - const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) - { - __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (M > 0) d00 = bias[0], d01 = bias[1]; - if (M > 1) d10 = bias[0], d11 = bias[1]; - if (M > 2) d20 = bias[0], d21 = bias[1]; - if (M > 3) d30 = bias[0], d31 = bias[1]; - if (M > 4) d40 = bias[0], d41 = bias[1]; - if (M > 5) d50 = bias[0], d51 = bias[1]; - const float* src1 = src0 + 1 * srcC; - const float* src2 = src0 + 2 * srcC; - const float* src3 = src0 + 3 * srcC; - const float* src4 = src0 + 4 * srcC; - const float* src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - if (M > 0) s0 = _mm_set1_ps(src0[sc]), d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00), d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - if (M > 1) s0 = _mm_set1_ps(src1[sc]), d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10), d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); - if (M > 2) s0 = _mm_set1_ps(src2[sc]), d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20), d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); - if (M > 3) s0 = _mm_set1_ps(src3[sc]), d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30), d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); - if (M > 4) s0 = _mm_set1_ps(src4[sc]), d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40), d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); - if (M > 5) s0 = _mm_set1_ps(src5[sc]), d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50), d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); - weight += DF; - } - if (M > 0) _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)), _mm_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - if (M > 1) _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)), _mm_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - if (M > 2) _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)), _mm_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - if (M > 3) _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)), _mm_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - if (M > 4) _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)), _mm_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - if (M > 5) _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)), _mm_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - typedef void(*InputConvolution1x1_2xM_Ptr)(const float* src0, size_t srcC, const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1); - - template InputConvolution1x1_2xM_Ptr GetInputConvolution1x1_2xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_2xM; - case 1: return InputConvolution1x1_2xM; - case 2: return InputConvolution1x1_2xM; - case 3: return InputConvolution1x1_2xM; - case 4: return InputConvolution1x1_2xM; - case 5: return InputConvolution1x1_2xM; - } - assert(0); - return NULL; - } - - template SIMD_INLINE void InputConvolution1x1_1x6(const float* src0, size_t srcC, - const float* weight, const __m128* bias, const __m128* params, float* dst0) - { - __m128 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - const float* src1 = src0 + 1 * srcC; - const float* src2 = src0 + 2 * srcC; - const float* src3 = src0 + 3 * srcC; - const float* src4 = src0 + 4 * srcC; - const float* src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src0[sc]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - s0 = _mm_set1_ps(src1[sc]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - s0 = _mm_set1_ps(src2[sc]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - s0 = _mm_set1_ps(src3[sc]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - s0 = _mm_set1_ps(src4[sc]); - d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - s0 = _mm_set1_ps(src5[sc]); - d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - weight += DF; - } - _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template SIMD_INLINE void InputConvolution1x1_1xM(const float* src0, size_t srcC, - const float* weight, const __m128* bias, const __m128* params, float* dst0) - { - __m128 d00, d10, d20, d30, d40, d50, s0, w0; - if (M > 0) d00 = bias[0]; - if (M > 1) d10 = bias[0]; - if (M > 2) d20 = bias[0]; - if (M > 3) d30 = bias[0]; - if (M > 4) d40 = bias[0]; - if (M > 5) d50 = bias[0]; - const float* src1 = src0 + 1 * srcC; - const float* src2 = src0 + 2 * srcC; - const float* src3 = src0 + 3 * srcC; - const float* src4 = src0 + 4 * srcC; - const float* src5 = src0 + 5 * srcC; - for (size_t sc = 0; sc < srcC; ++sc) - { - w0 = _mm_loadu_ps(weight + 0); - if (M > 0) s0 = _mm_set1_ps(src0[sc]), d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - if (M > 1) s0 = _mm_set1_ps(src1[sc]), d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - if (M > 2) s0 = _mm_set1_ps(src2[sc]), d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - if (M > 3) s0 = _mm_set1_ps(src3[sc]), d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - if (M > 4) s0 = _mm_set1_ps(src4[sc]), d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - if (M > 5) s0 = _mm_set1_ps(src5[sc]), d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - weight += DF; - } - if (M > 0) _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - if (M > 1) _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - if (M > 2) _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - if (M > 3) _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - if (M > 4) _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - if (M > 5) _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - typedef void(*InputConvolution1x1_1xM_Ptr)(const float* src0, size_t srcC, const float* weight, const __m128* bias, const __m128* params, float* dst0); - - template InputConvolution1x1_1xM_Ptr GetInputConvolution1x1_1xM(size_t M) - { - switch (M) - { - case 0: return InputConvolution1x1_1xM; - case 1: return InputConvolution1x1_1xM; - case 2: return InputConvolution1x1_1xM; - case 3: return InputConvolution1x1_1xM; - case 4: return InputConvolution1x1_1xM; - case 5: return InputConvolution1x1_1xM; - } - assert(0); - return NULL; - } - - template void InputConvolution1x1(const float* src, const SimdConvolutionParameters& p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - __m128 _params[2], _bias[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - size_t yInt = Simd::Max(yBeg, yEnd & (~dstM)), nBeg = yBeg * dstW, nInt = yInt * dstW, nEnd = yEnd * dstW; - size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; - InputConvolution1x1_2xM_Ptr tailInt_2 = GetInputConvolution1x1_2xM(nIntTail); - InputConvolution1x1_2xM_Ptr tailEnd_2 = GetInputConvolution1x1_2xM(nEndTail); - - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - _bias[0] = bias ? _mm_loadu_ps(bias + dc + 0) : _mm_setzero_ps(); - _bias[1] = bias ? _mm_loadu_ps(bias + dc + F) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm_loadu_ps(params + dc + 0); - _params[1] = _mm_loadu_ps(params + dc + F); - } - const float* pS = src + yBeg * srcW * srcC; - const float* pW = weight + dc * srcC; - float* pD = dst + (dc / F) * dstS; - float* dst0 = pD + (yBeg & dstM) * dstW * F; - float* dst1 = pD + (yInt & dstM) * dstW * F; - size_t dn = nBeg; - if (dstC - dc > F) - { - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); - if (nIntTail) - tailInt_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); - if (nEndTail) - tailEnd_2(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS), pS += nEndTail * srcC, dn += nEndTail; - } - else - { - InputConvolution1x1_1xM_Ptr tailInt_1 = GetInputConvolution1x1_1xM(nIntTail); - InputConvolution1x1_1xM_Ptr tailEnd_1 = GetInputConvolution1x1_1xM(nEndTail); - for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); - if (nIntTail) - tailInt_1(pS, srcC, pW, _bias, _params, dst0), pS += nIntTail * srcC, dn += nIntTail; - for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) - InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst1); - if (nEndTail) - tailEnd_1(pS, srcC, pW, _bias, _params, dst1), pS += nEndTail * srcC, dn += nEndTail; - } - } - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void InputConvolution_2x1(const float* src0, const SimdConvolutionParameters& p, - size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) - { - __m128 d00, d01, s0, w0, w1; - d00 = bias[0]; - d01 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src0[i]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm_storeu_ps(dst0, Activate(d00, params, 0)); - _mm_storeu_ps(dst1, Activate(d01, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x1(const float* src0, const SimdConvolutionParameters& p, - size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0) - { - __m128 d00, s0, w0; - d00 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC; - for (size_t ky = 0; ky < kH; ++ky) - { - for (size_t i = 0; i < size; ++i) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src0[i]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - weight += DF; - } - weight += tail; - src0 += stride; - } - _mm_storeu_ps(dst0, Activate(d00, params, 0)); - } - - template SIMD_INLINE void InputConvolution_2x6(const float* src0, const SimdConvolutionParameters& p, - size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) - { - __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - d00 = bias[0], d01 = bias[1]; - d10 = bias[0], d11 = bias[1]; - d20 = bias[0], d21 = bias[1]; - d30 = bias[0], d31 = bias[1]; - d40 = bias[0], d41 = bias[1]; - d50 = bias[0], d51 = bias[1]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float* src1 = src0 + 1 * step; - const float* src2 = src0 + 2 * step; - const float* src3 = src0 + 3 * step; - const float* src4 = src0 + 4 * step; - const float* src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src0[offset]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - s0 = _mm_set1_ps(src1[offset]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); - s0 = _mm_set1_ps(src2[offset]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); - s0 = _mm_set1_ps(src3[offset]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); - s0 = _mm_set1_ps(src4[offset]); - d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); - s0 = _mm_set1_ps(src5[offset]); - d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); - weight += DF; - } - weight += tail; - } - _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); - _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); - _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); - _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); - _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); - _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - _mm_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); - } - - template SIMD_INLINE void InputConvolution_1x6(const float* src0, const SimdConvolutionParameters& p, - size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0) - { - __m128 d00, d10, d20, d30, d40, d50, s0, w0; - d00 = bias[0]; - d10 = bias[0]; - d20 = bias[0]; - d30 = bias[0]; - d40 = bias[0]; - d50 = bias[0]; - size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; - const float* src1 = src0 + 1 * step; - const float* src2 = src0 + 2 * step; - const float* src3 = src0 + 3 * step; - const float* src4 = src0 + 4 * step; - const float* src5 = src0 + 5 * step; - for (size_t ky = 0; ky < kH; ++ky) - { - size_t offset = ky * stride; - for (size_t end = offset + size; offset < end; ++offset) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src0[offset]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - s0 = _mm_set1_ps(src1[offset]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - s0 = _mm_set1_ps(src2[offset]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - s0 = _mm_set1_ps(src3[offset]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - s0 = _mm_set1_ps(src4[offset]); - d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - s0 = _mm_set1_ps(src5[offset]); - d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - weight += DF; - } - weight += tail; - } - _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); - _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); - _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); - _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); - _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); - _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); - } - - template void InputConvolution(const float* src, const SimdConvolutionParameters& p, - size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; - size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX; - size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; - size_t dstCDF = AlignLo(dstC, DF); - if (dstC - F > dstCDF) - dstCDF += DF; - - size_t noseH = p.padY, noseW = p.padX; - size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; - size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; - size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; - size_t wS = p.srcC * p.dstC; - size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; - - __m128 _params[2], _bias[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - - size_t dc = 0; - for (; dc < dstCDF; dc += DF) - { - _bias[0] = bias ? _mm_loadu_ps(bias + dc + 0) : _mm_setzero_ps(); - _bias[1] = bias ? _mm_loadu_ps(bias + dc + F) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm_loadu_ps(params + dc + 0); - _params[1] = _mm_loadu_ps(params + dc + F); - } - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS, * dst1 = dst0 + dstS; - size_t sx = 0; - const float* s = src; - const float* w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kY + sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS, * dst1 = dst0 + dstS; - size_t sx = 0; - const float* s = src + (sy - noseH) * srcW * srcC; - const float* w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kernelY, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0, dst1); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS, * dst1 = dst0 + dstS; - size_t sx = 0; - const float* s = src + (sy - noseH) * srcW * srcC; - const float* w = weight; - for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s, p, kH - sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0, dst1); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) - InputConvolution_2x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); - for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) - InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0, dst1); - } - weight += kernelY * kernelX * srcC * DF; - } - if (dc < dstC) - { - _bias[0] = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + dc); - size_t dy = yBeg, sy = dy * strideY; - for (; sy < noseH && dy < yEnd; sy += strideY, dy++) - { - float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS; - size_t sx = 0; - const float* s = src; - const float* w = weight + (noseH - sy) * kernelX * DF * srcC; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kY + sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0); - } - for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) - { - float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS; - size_t sx = 0; - const float* s = src + (sy - noseH) * srcW * srcC; - const float* w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kernelY, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0); - } - for (; sy < tailH && dy < yEnd; sy += strideY, dy++) - { - float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS; - size_t sx = 0; - const float* s = src + (sy - noseH) * srcW * srcC; - const float* w = weight; - for (; sx < noseW; sx += strideX, dst0 += F) - InputConvolution_1x1(s, p, kH - sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0); - for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) - InputConvolution_1x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < bodyW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); - for (; sx < tailW; sx += strideX, dst0 += F) - InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0); - } - } - } - - //--------------------------------------------------------------------- - - template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F, strideXF = strideX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; - - __m128 _params[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + c); - - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + (dy & dstM) * dstW; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += F) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * F) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - __m128 sum2 = _bias; - __m128 sum3 = _bias; - __m128 sum4 = _bias; - __m128 sum5 = _bias; - __m128 sum6 = _bias; - __m128 sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXF), w0), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * strideXF), w0), sum2); - sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * strideXF), w0), sum3); - sum4 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * strideXF), w0), sum4); - sum5 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * strideXF), w0), sum5); - sum6 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * strideXF), w0), sum6); - sum7 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * strideXF), w0), sum7); - } - } - _mm_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - _mm_storeu_ps(pd + 4 * F, Activate(sum4, _params, 0)); - _mm_storeu_ps(pd + 5 * F, Activate(sum5, _params, 0)); - _mm_storeu_ps(pd + 6 * F, Activate(sum6, _params, 0)); - _mm_storeu_ps(pd + 7 * F, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * F) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - __m128 sum2 = _bias; - __m128 sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXF), w0), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * strideXF), w0), sum2); - sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * strideXF), w0), sum3); - } - } - _mm_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * F) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXF), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXF), w0), sum1); - } - } - _mm_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += F) - { - __m128 sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + ((sy & srcM) * p.srcW + dx * strideX - padX) * F; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += F, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), w0), sum); - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += F) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += F) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + ((sy & srcM) * p.srcW + sx) * F; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float* src0, const float* src1, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float* src0, const float* src1, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * F), weight[2]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * F), weight[5]), sum2); - _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float* src0, const float* src1, const float* src2, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * F), weight[6]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * F), weight[7]), sum1); - _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float* src0, const float* src1, const float* src2, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * F), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * F), weight[1]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * F), weight[2]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * F), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * F), weight[4]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * F), weight[5]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * F), weight[6]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * F), weight[7]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 2 * F), weight[8]), sum2); - _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); - } - - template void DepthwiseConvolution3x3(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcW = p.srcW * F, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t srcM = (bufH[0] - 1), dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t xStep = F * p.strideX, xStep0 = (p.strideX - p.padX) * F; - size_t xMainEnd = p.dstW - p.padW, yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; - - __m128 _params[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m128 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm_loadu_ps(weight + i * F); - __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float* src0 = src + ((sy + 0) & srcM) * srcW; - const float* src1 = src + ((sy + 1) & srcM) * srcW; - float* pDst = dst + (dy & dstM) * dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 4, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float* src0 = src + ((sy + 0) & srcM) * srcW; - const float* src1 = src + ((sy + 1) & srcM) * srcW; - const float* src2 = src + ((sy + 2) & srcM) * srcW; - float* pDst = dst + (dy & dstM) * dstW; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float* src0 = src + ((sy + 0) & srcM) * srcW; - const float* src1 = src + ((sy + 1) & srcM) * srcW; - float* pDst = dst + (dy & dstM) * dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, _weight + 0, _bias, _params, pDst); - } - src += srcS; - dst += dstS; - weight += weightS; - } - } - - //--------------------------------------------------------------------- - - template void OutputConvolution_2x6(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) - { - __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; - if (tail > F) - { - if (first) - { - d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); - d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps(); - d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps(); - d30 = _mm_setzero_ps(), d31 = _mm_setzero_ps(); - d40 = _mm_setzero_ps(), d41 = _mm_setzero_ps(); - d50 = _mm_setzero_ps(), d51 = _mm_setzero_ps(); - } - else - { - d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); - d10 = _mm_loadu_ps(dst + 1 * dstC + 0), d11 = _mm_loadu_ps(dst + 1 * dstC + F); - d20 = _mm_loadu_ps(dst + 2 * dstC + 0), d21 = _mm_loadu_ps(dst + 2 * dstC + F); - d30 = _mm_loadu_ps(dst + 3 * dstC + 0), d31 = _mm_loadu_ps(dst + 3 * dstC + F); - d40 = _mm_loadu_ps(dst + 4 * dstC + 0), d41 = _mm_loadu_ps(dst + 4 * dstC + F); - d50 = _mm_loadu_ps(dst + 5 * dstC + 0), d51 = _mm_loadu_ps(dst + 5 * dstC + F); - } - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - s0 = _mm_set1_ps(src[i + 1 * F]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); - s0 = _mm_set1_ps(src[i + 2 * F]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); - s0 = _mm_set1_ps(src[i + 3 * F]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); - s0 = _mm_set1_ps(src[i + 4 * F]); - d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); - s0 = _mm_set1_ps(src[i + 5 * F]); - d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - Term::template Save(dst + F, d41, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - Term::template Save(dst + F, d51, bias, params, tail); - } - } - else - { - if (first) - { - d00 = _mm_setzero_ps(); - d10 = _mm_setzero_ps(); - d20 = _mm_setzero_ps(); - d30 = _mm_setzero_ps(); - d40 = _mm_setzero_ps(); - d50 = _mm_setzero_ps(); - } - else - { - d00 = _mm_loadu_ps(dst + 0 * dstC + 0); - d10 = _mm_loadu_ps(dst + 1 * dstC + 0); - d20 = _mm_loadu_ps(dst + 2 * dstC + 0); - d30 = _mm_loadu_ps(dst + 3 * dstC + 0); - d40 = _mm_loadu_ps(dst + 4 * dstC + 0); - d50 = _mm_loadu_ps(dst + 5 * dstC + 0); - } - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - s0 = _mm_set1_ps(src[i + 1 * F]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - s0 = _mm_set1_ps(src[i + 2 * F]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - s0 = _mm_set1_ps(src[i + 3 * F]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - s0 = _mm_set1_ps(src[i + 4 * F]); - d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); - s0 = _mm_set1_ps(src[i + 5 * F]); - d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d40, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d50, bias, params, tail); - } - } - } - - template void OutputConvolution_2x4(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) - { - __m128 d00, d01, d10, d11, d20, d21, d30, d31, s0, w0, w1; - if (tail > F) - { - if (first) - { - d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); - d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps(); - d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps(); - d30 = _mm_setzero_ps(), d31 = _mm_setzero_ps(); - } - else - { - d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); - d10 = _mm_loadu_ps(dst + 1 * dstC + 0), d11 = _mm_loadu_ps(dst + 1 * dstC + F); - d20 = _mm_loadu_ps(dst + 2 * dstC + 0), d21 = _mm_loadu_ps(dst + 2 * dstC + F); - d30 = _mm_loadu_ps(dst + 3 * dstC + 0), d31 = _mm_loadu_ps(dst + 3 * dstC + F); - } - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - s0 = _mm_set1_ps(src[i + 1 * F]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); - s0 = _mm_set1_ps(src[i + 2 * F]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); - s0 = _mm_set1_ps(src[i + 3 * F]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - Term::template Save(dst + F, d31, bias, params, tail); - } - } - else - { - if (first) - { - d00 = _mm_setzero_ps(); - d10 = _mm_setzero_ps(); - d20 = _mm_setzero_ps(); - d30 = _mm_setzero_ps(); - } - else - { - d00 = _mm_loadu_ps(dst + 0 * dstC + 0); - d10 = _mm_loadu_ps(dst + 1 * dstC + 0); - d20 = _mm_loadu_ps(dst + 2 * dstC + 0); - d30 = _mm_loadu_ps(dst + 3 * dstC + 0); - } - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - s0 = _mm_set1_ps(src[i + 1 * F]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - s0 = _mm_set1_ps(src[i + 2 * F]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - s0 = _mm_set1_ps(src[i + 3 * F]); - d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d30, bias, params, tail); - } - } - } - - template void OutputConvolution_2x3(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) - { - __m128 d00, d01, d10, d11, d20, d21, s0, w0, w1; - if (tail > F) - { - if (first) - { - d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); - d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps(); - d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps(); - } - else - { - d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); - d10 = _mm_loadu_ps(dst + 1 * dstC + 0), d11 = _mm_loadu_ps(dst + 1 * dstC + F); - d20 = _mm_loadu_ps(dst + 2 * dstC + 0), d21 = _mm_loadu_ps(dst + 2 * dstC + F); - } - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - s0 = _mm_set1_ps(src[i + 1 * F]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); - s0 = _mm_set1_ps(src[i + 2 * F]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params); - } - else - { - tail -= F; - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - Term::template Save(dst + F, d11, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - Term::template Save(dst + F, d21, bias, params, tail); - } - } - else - { - if (first) - { - d00 = _mm_setzero_ps(); - d10 = _mm_setzero_ps(); - d20 = _mm_setzero_ps(); - } - else - { - d00 = _mm_loadu_ps(dst + 0 * dstC + 0); - d10 = _mm_loadu_ps(dst + 1 * dstC + 0); - d20 = _mm_loadu_ps(dst + 2 * dstC + 0); - } - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - s0 = _mm_set1_ps(src[i + 1 * F]); - d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); - s0 = _mm_set1_ps(src[i + 2 * F]); - d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); - } - src += srcS; - } - if (tail == F) - { - Term::template Save(dst + 0, d00, bias, params); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d10, bias, params, tail); - dst += dstC; - Term::template Save(dst + 0, d20, bias, params, tail); - } - } - } - - template void OutputConvolution_2x1(const float* src, size_t srcC, size_t srcS, - const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) - { - __m128 d00, d01, s0, w0, w1; - if (tail > F) - { - if (first) - d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); - else - d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - w1 = _mm_loadu_ps(weight + F); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); - } - src += srcS; - } - if (tail == DF) - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params); - } - else - { - Term::template Save(dst + 0, d00, bias, params); - Term::template Save(dst + F, d01, bias, params, tail - F); - } - } - else - { - if (first) - d00 = _mm_setzero_ps(); - else - d00 = _mm_loadu_ps(dst + 0 * dstC + 0); - for (size_t c = 0; c < srcC; c += F) - { - size_t n = Simd::Min(F, srcC - c); - for (size_t i = 0; i < n; ++i, weight += DF) - { - w0 = _mm_loadu_ps(weight + 0); - s0 = _mm_set1_ps(src[i + 0 * F]); - d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); - } - src += srcS; - } - if (tail == F) - Term::template Save(dst + 0, d00, bias, params); - else - Term::template Save(dst + 0, d00, bias, params, tail); - } - } - - template void OutputConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - assert(p.group == 1 && p.kernelY == 1 && p.strideY == 1); - size_t srcH = p.srcH, srcW = p.srcW, dstW = p.dstW, dstC = p.dstC; - size_t srcM = (bufH[1] - 1), srcS = bufH[1] * srcW * F; - size_t dstW3 = AlignLoAny(dstW, 3), dstW6 = AlignLoAny(dstW, 6); - __m128 _params[2], _bias[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - - dst += yBeg * p.dstW * p.dstC; - size_t dc = 0; - for (; dc < dstC; dc += DF) - { - size_t tail = Simd::Min(DF, dstC - dc); - _bias[0] = _mm_loadu_ps(bias + dc + 0); - _bias[1] = _mm_loadu_ps(bias + dc + F); - if (type == ::SimdConvolutionActivationPrelu) - { - _params[0] = _mm_loadu_ps(params + dc + 0); - _params[1] = _mm_loadu_ps(params + dc + F); - } - float* pDst = dst + dc; - for (size_t y = yBeg; y < yEnd; ++y) - { - const float* pSrc = src + (y & srcM) * srcW * F; - size_t x = 0; - for (; x < dstW6; x += 6, pDst += 6 * dstC, pSrc += 6 * F) - OutputConvolution_2x6(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first); - if (dstW - dstW6 == 4) - OutputConvolution_2x4(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first), pDst += 4 * dstC; - else - { - for (; x < dstW3; x += 3, pDst += 3 * dstC, pSrc += 3 * F) - OutputConvolution_2x3(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first); - for (; x < dstW; ++x, pDst += dstC, pSrc += F) - OutputConvolution_2x1(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first); - } - } - weight += srcC * DF; - } - } - - //--------------------------------------------------------------------- - - template void Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32fCdc::ConvolutionPtr * c) - { - switch (t) - { - case 0: - if (p.conv[i].kernelY == 1 && p.conv[i].strideY == 1) - c[i + 0] = InputConvolution1x1; - else - c[i + 0] = InputConvolution; - break; - case 1: - if (p.conv[i].kernelY == 3) - c[i + 0] = DepthwiseConvolution3x3; - else - c[i + 0] = DepthwiseConvolution; - break; - case 2: - c[i + 0] = OutputConvolution; - c[i + 1] = OutputConvolution; - break; - default: - assert(0); - } - } - } - - //--------------------------------------------------------------------- - - SynetMergedConvolution32fCdc::SynetMergedConvolution32fCdc(const MergConvParam& p) - : Base::SynetMergedConvolution32fCdc(p) - { - SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F); - for (size_t i = 0; i < _param.count; ++i) - Set(p, i, i, _convolution); - } - - void SynetMergedConvolution32fCdc::Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32f::ConvolutionPtr* c) - { - switch (p.conv[i].activation) - { - case SimdConvolutionActivationIdentity: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationRelu: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationLeakyRelu: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationRestrictRange: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationPrelu: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationElu: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationHswish: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationMish: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationHardSigmoid: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationSwish: Cdc::Set(p, t, i, c); break; - case SimdConvolutionActivationGelu: Cdc::Set(p, t, i, c); break; - default: assert(0); - } - } - - //--------------------------------------------------------------------- - - void* SynetMergedConvolution32fInit(size_t batch, const SimdConvolutionParameters* convs, size_t count, SimdBool add) - { - MergConvParam param(batch, convs, count, add, SimdSynetCompatibilityDefault); - if (!param.Valid(SimdTensorData32f)) - return NULL; - if (SynetMergedConvolution32fCdc::Preferable(param)) - return new Sse41::SynetMergedConvolution32fCdc(param); - else if (SynetMergedConvolution32fCd::Preferable(param)) - return new Sse41::SynetMergedConvolution32fCd(param); - else if (SynetMergedConvolution32fDc::Preferable(param)) - return new Sse41::SynetMergedConvolution32fDc(param); - else - return new Base::SynetMergedConvolution32f(param); - } - } -#endif -} diff --git a/src/Simd/SimdSse41SynetMergedConvolution32fDc.cpp b/src/Simd/SimdSse41SynetMergedConvolution32fDc.cpp deleted file mode 100644 index e745b160c9..0000000000 --- a/src/Simd/SimdSse41SynetMergedConvolution32fDc.cpp +++ /dev/null @@ -1,456 +0,0 @@ -/* -* Simd Library (http://ermig1979.github.io/Simd). -* -* Copyright (c) 2011-2024 Yermalayeu Ihar. -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to deal -* in the Software without restriction, including without limitation the rights -* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -* SOFTWARE. -*/ -#include "Simd/SimdSynetMergedConvolution32f.h" -#include "Simd/SimdSynetConvolution32fCommon.h" -#include "Simd/SimdUpdate.h" -#include "Simd/SimdCpu.h" - -namespace Simd -{ -#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) - namespace Sse41 - { - namespace Dc - { - template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcX = p.srcC, srcW = p.srcW * srcX, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F, strideXC = strideX * srcX; - size_t dstM = (bufH[1] - 1), dstS = bufH[1] * dstW; - size_t noseY = (p.padY + p.strideY - 1) / p.strideY; - size_t bodyY = (p.srcH + p.padY + p.strideY - p.kernelY) / p.strideY; - size_t noseX = (p.padX + p.strideX - 1) / p.strideX; - size_t bodyX = (p.srcW + p.padX + p.strideX - p.kernelX) / p.strideX; - size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; - size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; - size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; - - __m128 _params[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + c); - - for (size_t dy = yBeg; dy < yEnd; ++dy) - { - float* pd = dst + (dy & dstM) * dstW; - if (dy >= noseY && dy < bodyY) - { - size_t dx = 0; - for (; dx < noseX; ++dx, pd += F) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * p.strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * p.strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + sy * srcW + sx * srcX; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < bodyX8; dx += 8, pd += 8 * F) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - __m128 sum2 = _bias; - __m128 sum3 = _bias; - __m128 sum4 = _bias; - __m128 sum5 = _bias; - __m128 sum6 = _bias; - __m128 sum7 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + sy * srcW + (dx * strideX - padX) * srcX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += srcX, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXC), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXC), w0), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * strideXC), w0), sum2); - sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * strideXC), w0), sum3); - sum4 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * strideXC), w0), sum4); - sum5 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * strideXC), w0), sum5); - sum6 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * strideXC), w0), sum6); - sum7 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * strideXC), w0), sum7); - } - } - _mm_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - _mm_storeu_ps(pd + 4 * F, Activate(sum4, _params, 0)); - _mm_storeu_ps(pd + 5 * F, Activate(sum5, _params, 0)); - _mm_storeu_ps(pd + 6 * F, Activate(sum6, _params, 0)); - _mm_storeu_ps(pd + 7 * F, Activate(sum7, _params, 0)); - } - for (; dx < bodyX4; dx += 4, pd += 4 * F) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - __m128 sum2 = _bias; - __m128 sum3 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + sy * srcW + (dx * strideX - padX) * srcX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += srcX, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXC), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXC), w0), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * strideXC), w0), sum2); - sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * strideXC), w0), sum3); - } - } - _mm_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - _mm_storeu_ps(pd + 2 * F, Activate(sum2, _params, 0)); - _mm_storeu_ps(pd + 3 * F, Activate(sum3, _params, 0)); - } - for (; dx < bodyX2; dx += 2, pd += 2 * F) - { - __m128 sum0 = _bias; - __m128 sum1 = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + sy * srcW + (dx * strideX - padX) * srcX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += srcX, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * strideXC), w0), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * strideXC), w0), sum1); - } - } - _mm_storeu_ps(pd + 0 * F, Activate(sum0, _params, 0)); - _mm_storeu_ps(pd + 1 * F, Activate(sum1, _params, 0)); - } - for (; dx < bodyX; ++dx, pd += F) - { - __m128 sum = _bias; - const float* pw = weight; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - const float* ps = src + sy * srcW + (dx * strideX - padX) * srcX; - for (size_t kx = 0; kx < p.kernelX; ++kx, ps += srcX, pw += F) - { - __m128 w0 = _mm_loadu_ps(pw); - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), w0), sum); - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - for (; dx < p.dstW; ++dx, pd += F) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + sy * srcW + sx * srcX; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - else - { - for (size_t dx = 0; dx < p.dstW; ++dx, pd += F) - { - __m128 sum = _bias; - for (size_t ky = 0; ky < p.kernelY; ++ky) - { - size_t sy = dy * strideY + ky - padY; - if (sy < p.srcH) - { - for (size_t kx = 0; kx < p.kernelX; ++kx) - { - size_t sx = dx * strideX + kx - padX; - if (sx < p.srcW) - { - const float* pw = weight + (ky * p.kernelX + kx) * F; - const float* ps = src + sy * srcW + sx * srcX; - sum = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum); - } - } - } - } - _mm_storeu_ps(pd, Activate(sum, _params, 0)); - } - } - } - src += F; - dst += dstS; - weight += weightS; - } - } - - //--------------------------------------------------------------------- - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( - const float* src0, const float* src1, size_t srcC, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * srcC), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * srcC), weight[1]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * srcC), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * srcC), weight[4]), sum1); - _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( - const float* src0, const float* src1, size_t srcC, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * srcC), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * srcC), weight[1]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * srcC), weight[2]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * srcC), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * srcC), weight[4]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * srcC), weight[5]), sum2); - _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( - const float* src0, const float* src1, const float* src2, size_t srcC, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * srcC), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * srcC), weight[1]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * srcC), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * srcC), weight[4]), sum1); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * srcC), weight[6]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * srcC), weight[7]), sum1); - _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( - const float* src0, const float* src1, const float* src2, size_t srcC, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * srcC), weight[0]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * srcC), weight[1]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * srcC), weight[2]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * srcC), weight[3]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * srcC), weight[4]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * srcC), weight[5]), sum2); - sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * srcC), weight[6]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * srcC), weight[7]), sum1); - sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 2 * srcC), weight[8]), sum2); - _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); - } - - template SIMD_INLINE void ConvolutionDepthwise3x3Main1x2( - const float* src0, const float* src1, const float* src2, size_t srcC, const __m128* weight, const __m128& bias, const __m128* params, float* dst) - { - __m128 sum0 = bias, sum1 = bias, s0; - - s0 = _mm_loadu_ps(src0 + 0 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum0); - s0 = _mm_loadu_ps(src0 + 1 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[1]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[0]), sum1); - s0 = _mm_loadu_ps(src0 + 2 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[2]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[1]), sum1); - s0 = _mm_loadu_ps(src0 + 3 * srcC); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[2]), sum1); - - s0 = _mm_loadu_ps(src1 + 0 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[3]), sum0); - s0 = _mm_loadu_ps(src1 + 1 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[4]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[3]), sum1); - s0 = _mm_loadu_ps(src1 + 2 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[5]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[4]), sum1); - s0 = _mm_loadu_ps(src1 + 3 * srcC); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[5]), sum1); - - s0 = _mm_loadu_ps(src2 + 0 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[6]), sum0); - s0 = _mm_loadu_ps(src2 + 1 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[7]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[6]), sum1); - s0 = _mm_loadu_ps(src2 + 2 * srcC); - sum0 = _mm_add_ps(_mm_mul_ps(s0, weight[8]), sum0); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[7]), sum1); - s0 = _mm_loadu_ps(src2 + 3 * srcC); - sum1 = _mm_add_ps(_mm_mul_ps(s0, weight[8]), sum1); - - _mm_storeu_ps(dst + 0 * F, Activate(sum0, params, 0)); - _mm_storeu_ps(dst + 1 * F, Activate(sum1, params, 0)); - } - - template void DepthwiseConvolution3x3(const float* src, const SimdConvolutionParameters& p, - size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) - { - size_t strideY = p.strideY, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW; - size_t srcX = p.srcC, srcW = p.srcW * srcX, dstW = p.dstW * F, weightS = p.kernelY * p.kernelX * F; - size_t dstM = (bufH[1] - 1), srcS = bufH[0] * srcW, dstS = bufH[1] * dstW; - size_t xStep = srcX * p.strideX, xStep0 = (p.strideX - p.padX) * srcX; - size_t xMainEnd = p.dstW - p.padW, xMainEnd2 = AlignLo(xMainEnd - padX, 2)* (p.strideX == 1 ? 1 : 0) + padX; - size_t yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; - - __m128 _params[2]; - _params[0] = _mm_set1_ps(params[0]); - if (type == SimdConvolutionActivationRestrictRange || - type == SimdConvolutionActivationHswish || - type == SimdConvolutionActivationHardSigmoid) - _params[1] = _mm_set1_ps(params[1]); - for (size_t c = 0; c < srcC; c += F) - { - __m128 _weight[9]; - for (size_t i = 0; i < 9; ++i) - _weight[i] = _mm_loadu_ps(weight + i * F); - __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); - if (type == ::SimdConvolutionActivationPrelu) - _params[0] = _mm_loadu_ps(params + c); - - size_t dy = yBeg; - if (yBeg == 0 && padY) - { - size_t sy = 0, dx = 0; - const float* src0 = src + (sy + 0) * srcW; - const float* src1 = src + (sy + 1) * srcW; - float* pDst = dst + (dy & dstM) * dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, srcX, _weight + 4, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, srcX, _weight + 3, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, srcX, _weight + 3, _bias, _params, pDst); - dy++; - } - for (; dy < yMainEnd; ++dy) - { - size_t sy = dy * strideY - padY, dx = 0; - const float* src0 = src + (sy + 0) * srcW; - const float* src1 = src + (sy + 1) * srcW; - const float* src2 = src + (sy + 2) * srcW; - float* pDst = dst + (dy & dstM) * dstW; - if (padX) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, srcX, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0, src2 += xStep0; - for (; dx < xMainEnd2; dx += 2, pDst += F * 2, src0 += xStep * 2, src1 += xStep * 2, src2 += xStep * 2) - ConvolutionDepthwise3x3Main1x2(src0, src1, src2, srcX, _weight + 0, _bias, _params, pDst); - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep, src2 += xStep) - ConvolutionDepthwise3x3Main1x1(src0, src1, src2, srcX, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, srcX, _weight + 0, _bias, _params, pDst); - } - if (dy < yEnd) - { - size_t sy = dy * strideY - padY, dx = 0; - const float* src0 = src + (sy + 0) * srcW; - const float* src1 = src + (sy + 1) * srcW; - float* pDst = dst + (dy & dstM) * dstW; - if (padX) - ConvolutionDepthwise3x3Edge2x2(src0, src1, srcX, _weight + 1, _bias, _params, pDst), pDst += F, dx++, src0 += xStep0, src1 += xStep0; - for (; dx < xMainEnd; dx++, pDst += F, src0 += xStep, src1 += xStep) - ConvolutionDepthwise3x3Edge2x3(src0, src1, srcX, _weight + 0, _bias, _params, pDst); - if (padW) - ConvolutionDepthwise3x3Edge2x2(src0, src1, srcX, _weight + 0, _bias, _params, pDst); - } - src += F; - dst += dstS; - weight += weightS; - } - } - - //--------------------------------------------------------------------- - - template void Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32fCdc::ConvolutionPtr * c) - { - switch (t) - { - case 1: - if (p.conv[i].kernelY == 3) - c[i + 0] = DepthwiseConvolution3x3; - else - c[i + 0] = DepthwiseConvolution; - break; - default: - assert(0); - } - } - } - - //--------------------------------------------------------------------- - - SynetMergedConvolution32fDc::SynetMergedConvolution32fDc(const MergConvParam& p) - : Base::SynetMergedConvolution32fDc(p) - { - SetSize(Base::AlgCacheL1(), Base::AlgCacheL2(), Base::AlgCacheL3(), F); - SynetMergedConvolution32fDc::Set(_param, 1, 0, _convolution); - SynetMergedConvolution32fCdc::Set(_param, 2, 1, _convolution); - } - - void SynetMergedConvolution32fDc::Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32f::ConvolutionPtr* c) - { - switch (p.conv[i].activation) - { - case SimdConvolutionActivationIdentity: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationRelu: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationLeakyRelu: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationRestrictRange: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationPrelu: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationElu: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationHswish: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationMish: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationHardSigmoid: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationSwish: Dc::Set(p, t, i, c); break; - case SimdConvolutionActivationGelu: Dc::Set(p, t, i, c); break; - default: assert(0); - } - } - } -#endif -} diff --git a/src/Simd/SimdSse41SynetMergedConvolution32fDepthwise.cpp b/src/Simd/SimdSse41SynetMergedConvolution32fDepthwise.cpp new file mode 100644 index 0000000000..179b70add6 --- /dev/null +++ b/src/Simd/SimdSse41SynetMergedConvolution32fDepthwise.cpp @@ -0,0 +1,433 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2024 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdSynetMergedConvolution32f.h" +#include "Simd/SimdSynetConvolution32fCommon.h" +#include "Simd/SimdUpdate.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ +#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) + namespace Sse41 + { + SIMD_INLINE void Save(float* ptr, __m128 val, size_t tail) + { + float tmp[F]; + _mm_storeu_ps(tmp, val); + for (size_t i = 0; i < tail; ++i) + ptr[i] = tmp[i]; + } + + //------------------------------------------------------------------------------------------------------- + + template void DepthwiseConvolution(const float* src, const SimdConvolutionParameters& p, + size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) + { + size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW, dstC = srcC; + size_t sM = (bufH[0] - 1), sD = bufH[0] ? bufH[0] * p.srcW * F : F, sX = bufH[0] ? F : p.srcC, sY = sX * p.srcW; + size_t dM = (bufH[1] - 1), dX = (bufH[1] ? F : p.dstC), dY = p.dstW * dX, dy0 = bufH[1] ? yBeg : 0, dD = bufH[1] ? bufH[1] * dY : F; + size_t wD = p.kernelY * p.kernelX * F, ssX = strideX * sX; + size_t noseY = NoseH(p), bodyY = BodyH(p), noseX = NoseW(p), bodyX = BodyW(p); + size_t bodyX2 = AlignLo(bodyX - noseX, 2) + noseX; + size_t bodyX4 = AlignLo(bodyX - noseX, 4) + noseX; + size_t bodyX8 = AlignLo(bodyX - noseX, 8) + noseX; + size_t dstCF = AlignLo(dstC, F); + + __m128 _params[2]; + _params[0] = _mm_set1_ps(params[0]); + if (type == SimdConvolutionActivationRestrictRange || + type == SimdConvolutionActivationHswish || + type == SimdConvolutionActivationHardSigmoid) + _params[1] = _mm_set1_ps(params[1]); + for (size_t c = 0; c < dstC; c += F) + { + __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); + if (type == ::SimdConvolutionActivationPrelu) + _params[0] = _mm_loadu_ps(params + c); + if (c == dstCF) + { + size_t tail = dstC - dstCF; + for (size_t dy = yBeg; dy < yEnd; ++dy) + { + float* pd = dst + (dy & dM) * dY; + for (size_t dx = 0; dx < p.dstW; ++dx, pd += dX) + { + __m128 sum0 = _bias; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + if (sy < p.srcH) + { + for (size_t kx = 0; kx < p.kernelX; ++kx) + { + size_t sx = dx * strideX + kx - padX; + if (sx < p.srcW) + { + const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* ps = src + (sy & sM) * sY + sx * sX; + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum0); + } + } + } + } + Save(pd, Activate(sum0, _params, 0), tail); + } + } + return; + } + for (size_t dy = yBeg; dy < yEnd; ++dy) + { + float* pd = dst + (dy & dM) * dY; + if (dy >= noseY && dy < bodyY) + { + size_t dx = 0; + for (; dx < noseX; dx += 1, pd += dX) + { + __m128 sum0 = _bias; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * p.strideY + ky - padY; + for (size_t kx = 0; kx < p.kernelX; ++kx) + { + size_t sx = dx * p.strideX + kx - padX; + if (sx < p.srcW) + { + const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* ps = src + (sy & sM) * sY + sx * sX; + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum0); + } + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + } + for (; dx < bodyX8; dx += 8, pd += 8 * dX) + { + __m128 sum0 = _bias; + __m128 sum1 = _bias; + __m128 sum2 = _bias; + __m128 sum3 = _bias; + __m128 sum4 = _bias; + __m128 sum5 = _bias; + __m128 sum6 = _bias; + __m128 sum7 = _bias; + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + const float* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; + for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + { + __m128 w0 = _mm_loadu_ps(pw); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * ssX), w0), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * ssX), w0), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * ssX), w0), sum2); + sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * ssX), w0), sum3); + sum4 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 4 * ssX), w0), sum4); + sum5 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 5 * ssX), w0), sum5); + sum6 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 6 * ssX), w0), sum6); + sum7 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 7 * ssX), w0), sum7); + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + _mm_storeu_ps(pd + 1 * dX, Activate(sum1, _params, 0)); + _mm_storeu_ps(pd + 2 * dX, Activate(sum2, _params, 0)); + _mm_storeu_ps(pd + 3 * dX, Activate(sum3, _params, 0)); + _mm_storeu_ps(pd + 4 * dX, Activate(sum4, _params, 0)); + _mm_storeu_ps(pd + 5 * dX, Activate(sum5, _params, 0)); + _mm_storeu_ps(pd + 6 * dX, Activate(sum6, _params, 0)); + _mm_storeu_ps(pd + 7 * dX, Activate(sum7, _params, 0)); + } + for (; dx < bodyX4; dx += 4, pd += 4 * dX) + { + __m128 sum0 = _bias; + __m128 sum1 = _bias; + __m128 sum2 = _bias; + __m128 sum3 = _bias; + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + const float* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; + for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + { + __m128 w0 = _mm_loadu_ps(pw); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * ssX), w0), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * ssX), w0), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 2 * ssX), w0), sum2); + sum3 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 3 * ssX), w0), sum3); + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + _mm_storeu_ps(pd + 1 * dX, Activate(sum1, _params, 0)); + _mm_storeu_ps(pd + 2 * dX, Activate(sum2, _params, 0)); + _mm_storeu_ps(pd + 3 * dX, Activate(sum3, _params, 0)); + } + for (; dx < bodyX2; dx += 2, pd += 2 * dX) + { + __m128 sum0 = _bias; + __m128 sum1 = _bias; + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + const float* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; + for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + { + __m128 w0 = _mm_loadu_ps(pw); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 0 * ssX), w0), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps + 1 * ssX), w0), sum1); + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + _mm_storeu_ps(pd + 1 * dX, Activate(sum1, _params, 0)); + } + for (; dx < bodyX; dx += 1, pd += dX) + { + __m128 sum0 = _bias; + const float* pw = weight; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + const float* ps = src + (sy & sM) * sY + (dx * strideX - padX) * sX; + for (size_t kx = 0; kx < p.kernelX; ++kx, ps += sX, pw += F) + { + __m128 w0 = _mm_loadu_ps(pw); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), w0), sum0); + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + } + for (; dx < p.dstW; dx += 1, pd += dX) + { + __m128 sum0 = _bias; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + for (size_t kx = 0; kx < p.kernelX; ++kx) + { + size_t sx = dx * strideX + kx - padX; + if (sx < p.srcW) + { + const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* ps = src + (sy & sM) * sY + sx * sX; + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum0); + } + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + } + } + else + { + for (size_t dx = 0; dx < p.dstW; ++dx, pd += dX) + { + __m128 sum0 = _bias; + for (size_t ky = 0; ky < p.kernelY; ++ky) + { + size_t sy = dy * strideY + ky - padY; + if (sy < p.srcH) + { + for (size_t kx = 0; kx < p.kernelX; ++kx) + { + size_t sx = dx * strideX + kx - padX; + if (sx < p.srcW) + { + const float* pw = weight + (ky * p.kernelX + kx) * F; + const float* ps = src + (sy & sM) * sY + sx * sX; + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(ps), _mm_loadu_ps(pw)), sum0); + } + } + } + } + _mm_storeu_ps(pd + 0 * dX, Activate(sum0, _params, 0)); + } + } + } + src += sD; + dst += dD; + weight += wD; + } + } + + //------------------------------------------------------------------------------------------------------- + + template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x2( + const float* src0, const float* src1, size_t sX, const __m128* weight, const __m128& bias, const __m128* params, float* dst) + { + __m128 sum0 = bias, sum1 = _mm_setzero_ps(); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * sX), weight[0]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * sX), weight[1]), sum1); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * sX), weight[3]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * sX), weight[4]), sum1); + _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); + } + + template SIMD_INLINE void ConvolutionDepthwise3x3Edge2x3( + const float* src0, const float* src1, size_t sX, const __m128* weight, const __m128& bias, const __m128* params, float* dst) + { + __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * sX), weight[0]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * sX), weight[1]), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * sX), weight[2]), sum2); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * sX), weight[3]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * sX), weight[4]), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * sX), weight[5]), sum2); + _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); + } + + template SIMD_INLINE void ConvolutionDepthwise3x3Edge3x2( + const float* src0, const float* src1, const float* src2, size_t sX, const __m128* weight, const __m128& bias, const __m128* params, float* dst) + { + __m128 sum0 = bias, sum1 = _mm_setzero_ps(); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * sX), weight[0]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * sX), weight[1]), sum1); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * sX), weight[3]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * sX), weight[4]), sum1); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * sX), weight[6]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * sX), weight[7]), sum1); + _mm_storeu_ps(dst, Activate(_mm_add_ps(sum0, sum1), params, 0)); + } + + template SIMD_INLINE void ConvolutionDepthwise3x3Main1x1( + const float* src0, const float* src1, const float* src2, size_t sX, const __m128* weight, const __m128& bias, const __m128* params, float* dst) + { + __m128 sum0 = bias, sum1 = _mm_setzero_ps(), sum2 = _mm_setzero_ps(); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 0 * sX), weight[0]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 1 * sX), weight[1]), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src0 + 2 * sX), weight[2]), sum2); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 0 * sX), weight[3]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 1 * sX), weight[4]), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src1 + 2 * sX), weight[5]), sum2); + sum0 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 0 * sX), weight[6]), sum0); + sum1 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 1 * sX), weight[7]), sum1); + sum2 = _mm_add_ps(_mm_mul_ps(_mm_loadu_ps(src2 + 2 * sX), weight[8]), sum2); + _mm_storeu_ps(dst, Activate(_mm_add_ps(_mm_add_ps(sum0, sum1), sum2), params, 0)); + } + + template void DepthwiseConvolution3x3(const float* src, const SimdConvolutionParameters& p, + size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) + { + size_t strideY = p.strideY, strideX = p.strideX, padY = p.padY, padX = p.padX, padH = p.padH, padW = p.padW, dstC = srcC; + size_t sM = (bufH[0] - 1), sD = bufH[0] ? bufH[0] * p.srcW * F : F, sX = bufH[0] ? F : p.srcC, sY = sX * p.srcW; + size_t dM = (bufH[1] - 1), dX = (bufH[1] ? F : p.dstC), dY = p.dstW * dX, dy0 = bufH[1] ? yBeg : 0, dD = bufH[1] ? bufH[1] * dY : F; + size_t wD = p.kernelY * p.kernelX * F, ssX = p.strideX * sX, ssX0 = (p.strideX - p.padX) * sX; + size_t xMainEnd = p.dstW - p.padW, yMainEnd = yEnd == p.dstH && p.padH ? yEnd - 1 : yEnd; + + __m128 _params[2]; + _params[0] = _mm_set1_ps(params[0]); + if (type == SimdConvolutionActivationRestrictRange || + type == SimdConvolutionActivationHswish || + type == SimdConvolutionActivationHardSigmoid) + _params[1] = _mm_set1_ps(params[1]); + for (size_t c = 0; c < srcC; c += F) + { + __m128 _weight[9]; + for (size_t i = 0; i < 9; ++i) + _weight[i] = _mm_loadu_ps(weight + i * F); + __m128 _bias = bias ? _mm_loadu_ps(bias + c) : _mm_setzero_ps(); + if (type == ::SimdConvolutionActivationPrelu) + _params[0] = _mm_loadu_ps(params + c); + + size_t dy = yBeg; + if (yBeg == 0 && padY) + { + size_t sy = 0, dx = 0; + const float* src0 = src + ((sy + 0) & sM) * sY; + const float* src1 = src + ((sy + 1) & sM) * sY; + float* pDst = dst + (dy & dM) * dY; + if (padX) + ConvolutionDepthwise3x3Edge2x2(src0, src1, sX, _weight + 4, _bias, _params, pDst), + pDst += dX, dx++, src0 += ssX0, src1 += ssX0; + for (; dx < xMainEnd; dx++, pDst += dX, src0 += ssX, src1 += ssX) + ConvolutionDepthwise3x3Edge2x3(src0, src1, sX, _weight + 3, _bias, _params, pDst); + if (padW) + ConvolutionDepthwise3x3Edge2x2(src0, src1, sX, _weight + 3, _bias, _params, pDst); + dy++; + } + for (; dy < yMainEnd; ++dy) + { + size_t sy = dy * strideY - padY, dx = 0; + const float* src0 = src + ((sy + 0) & sM) * sY; + const float* src1 = src + ((sy + 1) & sM) * sY; + const float* src2 = src + ((sy + 2) & sM) * sY; + float* pDst = dst + (dy & dM) * dY; + if (padX) + ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, sX, _weight + 1, _bias, _params, pDst), + pDst += dX, dx++, src0 += ssX0, src1 += ssX0, src2 += ssX0; + for (; dx < xMainEnd; dx++, pDst += dX, src0 += ssX, src1 += ssX, src2 += ssX) + ConvolutionDepthwise3x3Main1x1(src0, src1, src2, sX, _weight + 0, _bias, _params, pDst); + if (padW) + ConvolutionDepthwise3x3Edge3x2(src0, src1, src2, sX, _weight + 0, _bias, _params, pDst); + } + if (dy < yEnd) + { + size_t sy = dy * strideY - padY, dx = 0; + const float* src0 = src + ((sy + 0) & sM) * sY; + const float* src1 = src + ((sy + 1) & sM) * sY; + float* pDst = dst + (dy & dM) * dY; + if (padX) + ConvolutionDepthwise3x3Edge2x2(src0, src1, sX, _weight + 1, _bias, _params, pDst), + pDst += dX, dx++, src0 += ssX0, src1 += ssX0; + for (; dx < xMainEnd; dx++, pDst += dX, src0 += ssX, src1 += ssX) + ConvolutionDepthwise3x3Edge2x3(src0, src1, sX, _weight + 0, _bias, _params, pDst); + if (padW) + ConvolutionDepthwise3x3Edge2x2(src0, src1, sX, _weight + 0, _bias, _params, pDst); + } + src += sD; + dst += dD; + weight += wD; + } + } + + //------------------------------------------------------------------------------------------------------- + + template void SetDepthwise(const ConvParam& p, bool last, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution) + { + if (p.kernelY == 3 && (!last || Aligned(p.dstC, F))) + convolution[0] = DepthwiseConvolution3x3; + else + convolution[0] = DepthwiseConvolution; + } + + void SetDepthwise(const ConvParam& p, bool last, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution) + { + switch (p.activation) + { + case SimdConvolutionActivationIdentity: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationRelu: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationLeakyRelu: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationRestrictRange: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationPrelu: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationElu: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationHswish: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationMish: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationHardSigmoid: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationSwish: SetDepthwise(p, last, convolution); break; + case SimdConvolutionActivationGelu: SetDepthwise(p, last, convolution); break; + default: assert(0); + } + } + } +#endif +} diff --git a/src/Simd/SimdSse41SynetMergedConvolution32fInput.cpp b/src/Simd/SimdSse41SynetMergedConvolution32fInput.cpp new file mode 100644 index 0000000000..577be71ef8 --- /dev/null +++ b/src/Simd/SimdSse41SynetMergedConvolution32fInput.cpp @@ -0,0 +1,609 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2024 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdSynetMergedConvolution32f.h" +#include "Simd/SimdSynetConvolution32fCommon.h" +#include "Simd/SimdUpdate.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ +#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) + namespace Sse41 + { + template SIMD_INLINE void InputConvolution1x1_2x6(const float* src0, size_t srcC, + const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) + { + __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; + d00 = bias[0], d01 = bias[1]; + d10 = bias[0], d11 = bias[1]; + d20 = bias[0], d21 = bias[1]; + d30 = bias[0], d31 = bias[1]; + d40 = bias[0], d41 = bias[1]; + d50 = bias[0], d51 = bias[1]; + const float* src1 = src0 + 1 * srcC; + const float* src2 = src0 + 2 * srcC; + const float* src3 = src0 + 3 * srcC; + const float* src4 = src0 + 4 * srcC; + const float* src5 = src0 + 5 * srcC; + for (size_t sc = 0; sc < srcC; ++sc) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src0[sc]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + s0 = _mm_set1_ps(src1[sc]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); + s0 = _mm_set1_ps(src2[sc]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); + s0 = _mm_set1_ps(src3[sc]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); + s0 = _mm_set1_ps(src4[sc]); + d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); + s0 = _mm_set1_ps(src5[sc]); + d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); + weight += DF; + } + _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); + _mm_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); + _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); + _mm_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); + _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); + _mm_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); + _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); + _mm_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); + _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); + _mm_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); + _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); + _mm_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); + } + + template SIMD_INLINE void InputConvolution1x1_2xM(const float* src0, size_t srcC, + const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) + { + __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; + if (M > 0) d00 = bias[0], d01 = bias[1]; + if (M > 1) d10 = bias[0], d11 = bias[1]; + if (M > 2) d20 = bias[0], d21 = bias[1]; + if (M > 3) d30 = bias[0], d31 = bias[1]; + if (M > 4) d40 = bias[0], d41 = bias[1]; + if (M > 5) d50 = bias[0], d51 = bias[1]; + const float* src1 = src0 + 1 * srcC; + const float* src2 = src0 + 2 * srcC; + const float* src3 = src0 + 3 * srcC; + const float* src4 = src0 + 4 * srcC; + const float* src5 = src0 + 5 * srcC; + for (size_t sc = 0; sc < srcC; ++sc) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + if (M > 0) s0 = _mm_set1_ps(src0[sc]), d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00), d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + if (M > 1) s0 = _mm_set1_ps(src1[sc]), d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10), d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); + if (M > 2) s0 = _mm_set1_ps(src2[sc]), d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20), d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); + if (M > 3) s0 = _mm_set1_ps(src3[sc]), d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30), d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); + if (M > 4) s0 = _mm_set1_ps(src4[sc]), d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40), d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); + if (M > 5) s0 = _mm_set1_ps(src5[sc]), d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50), d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); + weight += DF; + } + if (M > 0) _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)), _mm_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); + if (M > 1) _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)), _mm_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); + if (M > 2) _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)), _mm_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); + if (M > 3) _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)), _mm_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); + if (M > 4) _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)), _mm_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); + if (M > 5) _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)), _mm_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); + } + + typedef void(*InputConvolution1x1_2xM_Ptr)(const float* src0, size_t srcC, const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1); + + template InputConvolution1x1_2xM_Ptr GetInputConvolution1x1_2xM(size_t M) + { + switch (M) + { + case 0: return InputConvolution1x1_2xM; + case 1: return InputConvolution1x1_2xM; + case 2: return InputConvolution1x1_2xM; + case 3: return InputConvolution1x1_2xM; + case 4: return InputConvolution1x1_2xM; + case 5: return InputConvolution1x1_2xM; + } + assert(0); + return NULL; + } + + template SIMD_INLINE void InputConvolution1x1_1x6(const float* src0, size_t srcC, + const float* weight, const __m128* bias, const __m128* params, float* dst0) + { + __m128 d00, d10, d20, d30, d40, d50, s0, w0; + d00 = bias[0]; + d10 = bias[0]; + d20 = bias[0]; + d30 = bias[0]; + d40 = bias[0]; + d50 = bias[0]; + const float* src1 = src0 + 1 * srcC; + const float* src2 = src0 + 2 * srcC; + const float* src3 = src0 + 3 * srcC; + const float* src4 = src0 + 4 * srcC; + const float* src5 = src0 + 5 * srcC; + for (size_t sc = 0; sc < srcC; ++sc) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src0[sc]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + s0 = _mm_set1_ps(src1[sc]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + s0 = _mm_set1_ps(src2[sc]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + s0 = _mm_set1_ps(src3[sc]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + s0 = _mm_set1_ps(src4[sc]); + d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + s0 = _mm_set1_ps(src5[sc]); + d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + weight += DF; + } + _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); + _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); + _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); + _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); + _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); + _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); + } + + template SIMD_INLINE void InputConvolution1x1_1xM(const float* src0, size_t srcC, + const float* weight, const __m128* bias, const __m128* params, float* dst0) + { + __m128 d00, d10, d20, d30, d40, d50, s0, w0; + if (M > 0) d00 = bias[0]; + if (M > 1) d10 = bias[0]; + if (M > 2) d20 = bias[0]; + if (M > 3) d30 = bias[0]; + if (M > 4) d40 = bias[0]; + if (M > 5) d50 = bias[0]; + const float* src1 = src0 + 1 * srcC; + const float* src2 = src0 + 2 * srcC; + const float* src3 = src0 + 3 * srcC; + const float* src4 = src0 + 4 * srcC; + const float* src5 = src0 + 5 * srcC; + for (size_t sc = 0; sc < srcC; ++sc) + { + w0 = _mm_loadu_ps(weight + 0); + if (M > 0) s0 = _mm_set1_ps(src0[sc]), d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + if (M > 1) s0 = _mm_set1_ps(src1[sc]), d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + if (M > 2) s0 = _mm_set1_ps(src2[sc]), d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + if (M > 3) s0 = _mm_set1_ps(src3[sc]), d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + if (M > 4) s0 = _mm_set1_ps(src4[sc]), d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + if (M > 5) s0 = _mm_set1_ps(src5[sc]), d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + weight += DF; + } + if (M > 0) _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); + if (M > 1) _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); + if (M > 2) _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); + if (M > 3) _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); + if (M > 4) _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); + if (M > 5) _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); + } + + typedef void(*InputConvolution1x1_1xM_Ptr)(const float* src0, size_t srcC, const float* weight, const __m128* bias, const __m128* params, float* dst0); + + template InputConvolution1x1_1xM_Ptr GetInputConvolution1x1_1xM(size_t M) + { + switch (M) + { + case 0: return InputConvolution1x1_1xM; + case 1: return InputConvolution1x1_1xM; + case 2: return InputConvolution1x1_1xM; + case 3: return InputConvolution1x1_1xM; + case 4: return InputConvolution1x1_1xM; + case 5: return InputConvolution1x1_1xM; + } + assert(0); + return NULL; + } + + template void InputConvolution1x1(const float* src, const SimdConvolutionParameters& p, + size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) + { + size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; + size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; + size_t dstCDF = AlignLo(dstC, DF); + __m128 _params[2], _bias[2]; + _params[0] = _mm_set1_ps(params[0]); + if (type == SimdConvolutionActivationRestrictRange || + type == SimdConvolutionActivationHswish || + type == SimdConvolutionActivationHardSigmoid) + _params[1] = _mm_set1_ps(params[1]); + size_t yInt = Simd::Max(yBeg, yEnd & (~dstM)), nBeg = yBeg * dstW, nInt = yInt * dstW, nEnd = yEnd * dstW; + size_t nInt6 = AlignLoAny(nInt - nBeg, 6) + nBeg, nEnd6 = AlignLoAny(nEnd - nInt, 6) + nInt, nIntTail = nInt - nInt6, nEndTail = nEnd - nEnd6; + InputConvolution1x1_2xM_Ptr tailInt_2 = GetInputConvolution1x1_2xM(nIntTail); + InputConvolution1x1_2xM_Ptr tailEnd_2 = GetInputConvolution1x1_2xM(nEndTail); + + size_t dc = 0; + for (; dc < dstC; dc += DF) + { + _bias[0] = bias ? _mm_loadu_ps(bias + dc + 0) : _mm_setzero_ps(); + _bias[1] = bias ? _mm_loadu_ps(bias + dc + F) : _mm_setzero_ps(); + if (type == ::SimdConvolutionActivationPrelu) + { + _params[0] = _mm_loadu_ps(params + dc + 0); + _params[1] = _mm_loadu_ps(params + dc + F); + } + const float* pS = src + yBeg * srcW * srcC; + const float* pW = weight + dc * srcC; + float* pD = dst + (dc / F) * dstS; + float* dst0 = pD + (yBeg & dstM) * dstW * F; + float* dst1 = pD + (yInt & dstM) * dstW * F; + size_t dn = nBeg; + if (dstC - dc > F) + { + for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) + InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS); + if (nIntTail) + tailInt_2(pS, srcC, pW, _bias, _params, dst0, dst0 + dstS), pS += nIntTail * srcC, dn += nIntTail; + for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) + InputConvolution1x1_2x6(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS); + if (nEndTail) + tailEnd_2(pS, srcC, pW, _bias, _params, dst1, dst1 + dstS), pS += nEndTail * srcC, dn += nEndTail; + } + else + { + InputConvolution1x1_1xM_Ptr tailInt_1 = GetInputConvolution1x1_1xM(nIntTail); + InputConvolution1x1_1xM_Ptr tailEnd_1 = GetInputConvolution1x1_1xM(nEndTail); + for (; dn < nInt6; dn += 6, pS += 6 * srcC, dst0 += 6 * F) + InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst0); + if (nIntTail) + tailInt_1(pS, srcC, pW, _bias, _params, dst0), pS += nIntTail * srcC, dn += nIntTail; + for (; dn < nEnd6; dn += 6, pS += 6 * srcC, dst1 += 6 * F) + InputConvolution1x1_1x6(pS, srcC, pW, _bias, _params, dst1); + if (nEndTail) + tailEnd_1(pS, srcC, pW, _bias, _params, dst1), pS += nEndTail * srcC, dn += nEndTail; + } + } + } + + //--------------------------------------------------------------------- + + template SIMD_INLINE void InputConvolution_2x1(const float* src0, const SimdConvolutionParameters& p, + size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) + { + __m128 d00, d01, s0, w0, w1; + d00 = bias[0]; + d01 = bias[1]; + size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC; + for (size_t ky = 0; ky < kH; ++ky) + { + for (size_t i = 0; i < size; ++i) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src0[i]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + weight += DF; + } + weight += tail; + src0 += stride; + } + _mm_storeu_ps(dst0, Activate(d00, params, 0)); + _mm_storeu_ps(dst1, Activate(d01, params, 1)); + } + + template SIMD_INLINE void InputConvolution_1x1(const float* src0, const SimdConvolutionParameters& p, + size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0) + { + __m128 d00, s0, w0; + d00 = bias[0]; + size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC; + for (size_t ky = 0; ky < kH; ++ky) + { + for (size_t i = 0; i < size; ++i) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src0[i]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + weight += DF; + } + weight += tail; + src0 += stride; + } + _mm_storeu_ps(dst0, Activate(d00, params, 0)); + } + + template SIMD_INLINE void InputConvolution_2x6(const float* src0, const SimdConvolutionParameters& p, + size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0, float* dst1) + { + __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; + d00 = bias[0], d01 = bias[1]; + d10 = bias[0], d11 = bias[1]; + d20 = bias[0], d21 = bias[1]; + d30 = bias[0], d31 = bias[1]; + d40 = bias[0], d41 = bias[1]; + d50 = bias[0], d51 = bias[1]; + size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; + const float* src1 = src0 + 1 * step; + const float* src2 = src0 + 2 * step; + const float* src3 = src0 + 3 * step; + const float* src4 = src0 + 4 * step; + const float* src5 = src0 + 5 * step; + for (size_t ky = 0; ky < kH; ++ky) + { + size_t offset = ky * stride; + for (size_t end = offset + size; offset < end; ++offset) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src0[offset]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + s0 = _mm_set1_ps(src1[offset]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); + s0 = _mm_set1_ps(src2[offset]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); + s0 = _mm_set1_ps(src3[offset]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); + s0 = _mm_set1_ps(src4[offset]); + d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); + s0 = _mm_set1_ps(src5[offset]); + d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); + weight += DF; + } + weight += tail; + } + _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); + _mm_storeu_ps(dst1 + 0 * F, Activate(d01, params, 1)); + _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); + _mm_storeu_ps(dst1 + 1 * F, Activate(d11, params, 1)); + _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); + _mm_storeu_ps(dst1 + 2 * F, Activate(d21, params, 1)); + _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); + _mm_storeu_ps(dst1 + 3 * F, Activate(d31, params, 1)); + _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); + _mm_storeu_ps(dst1 + 4 * F, Activate(d41, params, 1)); + _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); + _mm_storeu_ps(dst1 + 5 * F, Activate(d51, params, 1)); + } + + template SIMD_INLINE void InputConvolution_1x6(const float* src0, const SimdConvolutionParameters& p, + size_t kH, size_t kW, const float* weight, const __m128* bias, const __m128* params, float* dst0) + { + __m128 d00, d10, d20, d30, d40, d50, s0, w0; + d00 = bias[0]; + d10 = bias[0]; + d20 = bias[0]; + d30 = bias[0]; + d40 = bias[0]; + d50 = bias[0]; + size_t size = kW * p.srcC, tail = DF * (p.kernelX - kW) * p.srcC, stride = p.srcW * p.srcC, step = p.srcC * p.strideX; + const float* src1 = src0 + 1 * step; + const float* src2 = src0 + 2 * step; + const float* src3 = src0 + 3 * step; + const float* src4 = src0 + 4 * step; + const float* src5 = src0 + 5 * step; + for (size_t ky = 0; ky < kH; ++ky) + { + size_t offset = ky * stride; + for (size_t end = offset + size; offset < end; ++offset) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src0[offset]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + s0 = _mm_set1_ps(src1[offset]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + s0 = _mm_set1_ps(src2[offset]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + s0 = _mm_set1_ps(src3[offset]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + s0 = _mm_set1_ps(src4[offset]); + d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + s0 = _mm_set1_ps(src5[offset]); + d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + weight += DF; + } + weight += tail; + } + _mm_storeu_ps(dst0 + 0 * F, Activate(d00, params, 0)); + _mm_storeu_ps(dst0 + 1 * F, Activate(d10, params, 0)); + _mm_storeu_ps(dst0 + 2 * F, Activate(d20, params, 0)); + _mm_storeu_ps(dst0 + 3 * F, Activate(d30, params, 0)); + _mm_storeu_ps(dst0 + 4 * F, Activate(d40, params, 0)); + _mm_storeu_ps(dst0 + 5 * F, Activate(d50, params, 0)); + } + + template void InputConvolution(const float* src, const SimdConvolutionParameters& p, + size_t dstC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) + { + size_t srcH = p.srcH, srcW = p.srcW, srcC = p.srcC, dstW = p.dstW; + size_t kernelY = p.kernelY, kernelX = p.kernelX, strideY = p.strideY, strideX = p.strideX; + size_t dstM = (bufH[0] - 1), dstS = bufH[0] * dstW * F; + size_t dstCDF = AlignLo(dstC, DF); + if (dstC - F > dstCDF) + dstCDF += DF; + + size_t noseH = p.padY, noseW = p.padX; + size_t bodyH = p.srcH - p.kernelY + 1 + noseH, bodyW = p.srcW - p.kernelX + 1 + noseW; + size_t bodyW6 = AlignLoAny(bodyW - noseW, 6 * p.strideX) + noseW; + size_t tailH = bodyH + p.padH, tailW = bodyW + p.padW; + size_t wS = p.srcC * p.dstC; + size_t kY = p.kernelY - noseH, kX = p.kernelX - noseW, kH = bodyH + p.kernelY - 1, kW = bodyW + p.kernelX - 1; + + __m128 _params[2], _bias[2]; + _params[0] = _mm_set1_ps(params[0]); + if (type == SimdConvolutionActivationRestrictRange || + type == SimdConvolutionActivationHswish || + type == SimdConvolutionActivationHardSigmoid) + _params[1] = _mm_set1_ps(params[1]); + + size_t dc = 0; + for (; dc < dstCDF; dc += DF) + { + _bias[0] = bias ? _mm_loadu_ps(bias + dc + 0) : _mm_setzero_ps(); + _bias[1] = bias ? _mm_loadu_ps(bias + dc + F) : _mm_setzero_ps(); + if (type == ::SimdConvolutionActivationPrelu) + { + _params[0] = _mm_loadu_ps(params + dc + 0); + _params[1] = _mm_loadu_ps(params + dc + F); + } + size_t dy = yBeg, sy = dy * strideY; + for (; sy < noseH && dy < yEnd; sy += strideY, dy++) + { + float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS, * dst1 = dst0 + dstS; + size_t sx = 0; + const float* s = src; + const float* w = weight + (noseH - sy) * kernelX * DF * srcC; + for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s, p, kY + sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0, dst1); + for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) + InputConvolution_2x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); + for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0, dst1); + for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0, dst1); + } + for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) + { + float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS, * dst1 = dst0 + dstS; + size_t sx = 0; + const float* s = src + (sy - noseH) * srcW * srcC; + const float* w = weight; + for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s, p, kernelY, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0, dst1); + for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) + InputConvolution_2x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); + for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0, dst1); + for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0, dst1); + } + for (; sy < tailH && dy < yEnd; sy += strideY, dy++) + { + float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS, * dst1 = dst0 + dstS; + size_t sx = 0; + const float* s = src + (sy - noseH) * srcW * srcC; + const float* w = weight; + for (; sx < noseW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s, p, kH - sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0, dst1); + for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F, dst1 += 6 * F) + InputConvolution_2x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); + for (; sx < bodyW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0, dst1); + for (; sx < tailW; sx += strideX, dst0 += F, dst1 += F) + InputConvolution_2x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0, dst1); + } + weight += kernelY * kernelX * srcC * DF; + } + if (dc < dstC) + { + _bias[0] = bias ? _mm_loadu_ps(bias + dc) : _mm_setzero_ps(); + if (type == ::SimdConvolutionActivationPrelu) + _params[0] = _mm_loadu_ps(params + dc); + size_t dy = yBeg, sy = dy * strideY; + for (; sy < noseH && dy < yEnd; sy += strideY, dy++) + { + float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS; + size_t sx = 0; + const float* s = src; + const float* w = weight + (noseH - sy) * kernelX * DF * srcC; + for (; sx < noseW; sx += strideX, dst0 += F) + InputConvolution_1x1(s, p, kY + sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0); + for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) + InputConvolution_1x6(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); + for (; sx < bodyW; sx += strideX, dst0 += F) + InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kernelX, w, _bias, _params, dst0); + for (; sx < tailW; sx += strideX, dst0 += F) + InputConvolution_1x1(s + (sx - noseW) * srcC, p, kY + sy, kW - sx, w, _bias, _params, dst0); + } + for (; sy < bodyH && dy < yEnd; sy += strideY, dy++) + { + float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS; + size_t sx = 0; + const float* s = src + (sy - noseH) * srcW * srcC; + const float* w = weight; + for (; sx < noseW; sx += strideX, dst0 += F) + InputConvolution_1x1(s, p, kernelY, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0); + for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) + InputConvolution_1x6(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); + for (; sx < bodyW; sx += strideX, dst0 += F) + InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kernelX, w, _bias, _params, dst0); + for (; sx < tailW; sx += strideX, dst0 += F) + InputConvolution_1x1(s + (sx - noseW) * srcC, p, kernelY, kW - sx, w, _bias, _params, dst0); + } + for (; sy < tailH && dy < yEnd; sy += strideY, dy++) + { + float* dst0 = dst + (dy & dstM) * dstW * F + (dc / F) * dstS; + size_t sx = 0; + const float* s = src + (sy - noseH) * srcW * srcC; + const float* w = weight; + for (; sx < noseW; sx += strideX, dst0 += F) + InputConvolution_1x1(s, p, kH - sy, kX + sx, w + (noseW - sx) * srcC * DF, _bias, _params, dst0); + for (; sx < bodyW6; sx += 6 * strideX, dst0 += 6 * F) + InputConvolution_1x6(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); + for (; sx < bodyW; sx += strideX, dst0 += F) + InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kernelX, w, _bias, _params, dst0); + for (; sx < tailW; sx += strideX, dst0 += F) + InputConvolution_1x1(s + (sx - noseW) * srcC, p, kH - sy, kW - sx, w, _bias, _params, dst0); + } + } + } + + //------------------------------------------------------------------------------------------------------- + + template void SetInput(const ConvParam& p, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution) + { + if (p.kernelY == 1 && p.strideY == 1) + convolution[0] = InputConvolution1x1; + else + convolution[0] = InputConvolution; + } + + void SetInput(const ConvParam& p, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution) + { + switch (p.activation) + { + case SimdConvolutionActivationIdentity: SetInput(p, convolution); break; + case SimdConvolutionActivationRelu: SetInput(p, convolution); break; + case SimdConvolutionActivationLeakyRelu: SetInput(p, convolution); break; + case SimdConvolutionActivationRestrictRange: SetInput(p, convolution); break; + case SimdConvolutionActivationPrelu: SetInput(p, convolution); break; + case SimdConvolutionActivationElu: SetInput(p, convolution); break; + case SimdConvolutionActivationHswish: SetInput(p, convolution); break; + case SimdConvolutionActivationMish: SetInput(p, convolution); break; + case SimdConvolutionActivationHardSigmoid: SetInput(p, convolution); break; + case SimdConvolutionActivationSwish: SetInput(p, convolution); break; + case SimdConvolutionActivationGelu: SetInput(p, convolution); break; + default: assert(0); + } + } + } +#endif +} diff --git a/src/Simd/SimdSse41SynetMergedConvolution32fOutput.cpp b/src/Simd/SimdSse41SynetMergedConvolution32fOutput.cpp new file mode 100644 index 0000000000..cfdd9b7705 --- /dev/null +++ b/src/Simd/SimdSse41SynetMergedConvolution32fOutput.cpp @@ -0,0 +1,570 @@ +/* +* Simd Library (http://ermig1979.github.io/Simd). +* +* Copyright (c) 2011-2024 Yermalayeu Ihar. +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +* SOFTWARE. +*/ +#include "Simd/SimdSynetMergedConvolution32f.h" +#include "Simd/SimdSynetConvolution32fCommon.h" +#include "Simd/SimdUpdate.h" +#include "Simd/SimdCpu.h" + +namespace Simd +{ +#if defined(SIMD_SSE41_ENABLE) && defined(SIMD_SYNET_ENABLE) + namespace Sse41 + { + template void OutputConvolution_2x6(const float* src, size_t srcC, size_t srcS, + const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) + { + __m128 d00, d01, d10, d11, d20, d21, d30, d31, d40, d41, d50, d51, s0, w0, w1; + if (tail > F) + { + if (first) + { + d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); + d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps(); + d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps(); + d30 = _mm_setzero_ps(), d31 = _mm_setzero_ps(); + d40 = _mm_setzero_ps(), d41 = _mm_setzero_ps(); + d50 = _mm_setzero_ps(), d51 = _mm_setzero_ps(); + } + else + { + d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); + d10 = _mm_loadu_ps(dst + 1 * dstC + 0), d11 = _mm_loadu_ps(dst + 1 * dstC + F); + d20 = _mm_loadu_ps(dst + 2 * dstC + 0), d21 = _mm_loadu_ps(dst + 2 * dstC + F); + d30 = _mm_loadu_ps(dst + 3 * dstC + 0), d31 = _mm_loadu_ps(dst + 3 * dstC + F); + d40 = _mm_loadu_ps(dst + 4 * dstC + 0), d41 = _mm_loadu_ps(dst + 4 * dstC + F); + d50 = _mm_loadu_ps(dst + 5 * dstC + 0), d51 = _mm_loadu_ps(dst + 5 * dstC + F); + } + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + s0 = _mm_set1_ps(src[i + 1 * F]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); + s0 = _mm_set1_ps(src[i + 2 * F]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); + s0 = _mm_set1_ps(src[i + 3 * F]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); + s0 = _mm_set1_ps(src[i + 4 * F]); + d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + d41 = _mm_add_ps(_mm_mul_ps(s0, w1), d41); + s0 = _mm_set1_ps(src[i + 5 * F]); + d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + d51 = _mm_add_ps(_mm_mul_ps(s0, w1), d51); + } + src += srcS; + } + if (tail == DF) + { + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + Term::template Save(dst + F, d11, bias, params); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + Term::template Save(dst + F, d21, bias, params); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params); + Term::template Save(dst + F, d31, bias, params); + dst += dstC; + Term::template Save(dst + 0, d40, bias, params); + Term::template Save(dst + F, d41, bias, params); + dst += dstC; + Term::template Save(dst + 0, d50, bias, params); + Term::template Save(dst + F, d51, bias, params); + } + else + { + tail -= F; + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + Term::template Save(dst + F, d11, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + Term::template Save(dst + F, d21, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params); + Term::template Save(dst + F, d31, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d40, bias, params); + Term::template Save(dst + F, d41, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d50, bias, params); + Term::template Save(dst + F, d51, bias, params, tail); + } + } + else + { + if (first) + { + d00 = _mm_setzero_ps(); + d10 = _mm_setzero_ps(); + d20 = _mm_setzero_ps(); + d30 = _mm_setzero_ps(); + d40 = _mm_setzero_ps(); + d50 = _mm_setzero_ps(); + } + else + { + d00 = _mm_loadu_ps(dst + 0 * dstC + 0); + d10 = _mm_loadu_ps(dst + 1 * dstC + 0); + d20 = _mm_loadu_ps(dst + 2 * dstC + 0); + d30 = _mm_loadu_ps(dst + 3 * dstC + 0); + d40 = _mm_loadu_ps(dst + 4 * dstC + 0); + d50 = _mm_loadu_ps(dst + 5 * dstC + 0); + } + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + s0 = _mm_set1_ps(src[i + 1 * F]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + s0 = _mm_set1_ps(src[i + 2 * F]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + s0 = _mm_set1_ps(src[i + 3 * F]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + s0 = _mm_set1_ps(src[i + 4 * F]); + d40 = _mm_add_ps(_mm_mul_ps(s0, w0), d40); + s0 = _mm_set1_ps(src[i + 5 * F]); + d50 = _mm_add_ps(_mm_mul_ps(s0, w0), d50); + } + src += srcS; + } + if (tail == F) + { + Term::template Save(dst + 0, d00, bias, params); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params); + dst += dstC; + Term::template Save(dst + 0, d40, bias, params); + dst += dstC; + Term::template Save(dst + 0, d50, bias, params); + } + else + { + Term::template Save(dst + 0, d00, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d40, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d50, bias, params, tail); + } + } + } + + template void OutputConvolution_2x4(const float* src, size_t srcC, size_t srcS, + const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) + { + __m128 d00, d01, d10, d11, d20, d21, d30, d31, s0, w0, w1; + if (tail > F) + { + if (first) + { + d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); + d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps(); + d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps(); + d30 = _mm_setzero_ps(), d31 = _mm_setzero_ps(); + } + else + { + d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); + d10 = _mm_loadu_ps(dst + 1 * dstC + 0), d11 = _mm_loadu_ps(dst + 1 * dstC + F); + d20 = _mm_loadu_ps(dst + 2 * dstC + 0), d21 = _mm_loadu_ps(dst + 2 * dstC + F); + d30 = _mm_loadu_ps(dst + 3 * dstC + 0), d31 = _mm_loadu_ps(dst + 3 * dstC + F); + } + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + s0 = _mm_set1_ps(src[i + 1 * F]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); + s0 = _mm_set1_ps(src[i + 2 * F]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); + s0 = _mm_set1_ps(src[i + 3 * F]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + d31 = _mm_add_ps(_mm_mul_ps(s0, w1), d31); + } + src += srcS; + } + if (tail == DF) + { + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + Term::template Save(dst + F, d11, bias, params); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + Term::template Save(dst + F, d21, bias, params); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params); + Term::template Save(dst + F, d31, bias, params); + } + else + { + tail -= F; + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + Term::template Save(dst + F, d11, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + Term::template Save(dst + F, d21, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params); + Term::template Save(dst + F, d31, bias, params, tail); + } + } + else + { + if (first) + { + d00 = _mm_setzero_ps(); + d10 = _mm_setzero_ps(); + d20 = _mm_setzero_ps(); + d30 = _mm_setzero_ps(); + } + else + { + d00 = _mm_loadu_ps(dst + 0 * dstC + 0); + d10 = _mm_loadu_ps(dst + 1 * dstC + 0); + d20 = _mm_loadu_ps(dst + 2 * dstC + 0); + d30 = _mm_loadu_ps(dst + 3 * dstC + 0); + } + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + s0 = _mm_set1_ps(src[i + 1 * F]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + s0 = _mm_set1_ps(src[i + 2 * F]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + s0 = _mm_set1_ps(src[i + 3 * F]); + d30 = _mm_add_ps(_mm_mul_ps(s0, w0), d30); + } + src += srcS; + } + if (tail == F) + { + Term::template Save(dst + 0, d00, bias, params); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params); + } + else + { + Term::template Save(dst + 0, d00, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d30, bias, params, tail); + } + } + } + + template void OutputConvolution_2x3(const float* src, size_t srcC, size_t srcS, + const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) + { + __m128 d00, d01, d10, d11, d20, d21, s0, w0, w1; + if (tail > F) + { + if (first) + { + d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); + d10 = _mm_setzero_ps(), d11 = _mm_setzero_ps(); + d20 = _mm_setzero_ps(), d21 = _mm_setzero_ps(); + } + else + { + d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); + d10 = _mm_loadu_ps(dst + 1 * dstC + 0), d11 = _mm_loadu_ps(dst + 1 * dstC + F); + d20 = _mm_loadu_ps(dst + 2 * dstC + 0), d21 = _mm_loadu_ps(dst + 2 * dstC + F); + } + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + s0 = _mm_set1_ps(src[i + 1 * F]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + d11 = _mm_add_ps(_mm_mul_ps(s0, w1), d11); + s0 = _mm_set1_ps(src[i + 2 * F]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + d21 = _mm_add_ps(_mm_mul_ps(s0, w1), d21); + } + src += srcS; + } + if (tail == DF) + { + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + Term::template Save(dst + F, d11, bias, params); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + Term::template Save(dst + F, d21, bias, params); + } + else + { + tail -= F; + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + Term::template Save(dst + F, d11, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + Term::template Save(dst + F, d21, bias, params, tail); + } + } + else + { + if (first) + { + d00 = _mm_setzero_ps(); + d10 = _mm_setzero_ps(); + d20 = _mm_setzero_ps(); + } + else + { + d00 = _mm_loadu_ps(dst + 0 * dstC + 0); + d10 = _mm_loadu_ps(dst + 1 * dstC + 0); + d20 = _mm_loadu_ps(dst + 2 * dstC + 0); + } + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + s0 = _mm_set1_ps(src[i + 1 * F]); + d10 = _mm_add_ps(_mm_mul_ps(s0, w0), d10); + s0 = _mm_set1_ps(src[i + 2 * F]); + d20 = _mm_add_ps(_mm_mul_ps(s0, w0), d20); + } + src += srcS; + } + if (tail == F) + { + Term::template Save(dst + 0, d00, bias, params); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params); + } + else + { + Term::template Save(dst + 0, d00, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d10, bias, params, tail); + dst += dstC; + Term::template Save(dst + 0, d20, bias, params, tail); + } + } + } + + template void OutputConvolution_2x1(const float* src, size_t srcC, size_t srcS, + const float* weight, const __m128* bias, const __m128* params, float* dst, size_t dstC, size_t tail, int first) + { + __m128 d00, d01, s0, w0, w1; + if (tail > F) + { + if (first) + d00 = _mm_setzero_ps(), d01 = _mm_setzero_ps(); + else + d00 = _mm_loadu_ps(dst + 0 * dstC + 0), d01 = _mm_loadu_ps(dst + 0 * dstC + F); + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + w1 = _mm_loadu_ps(weight + F); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + d01 = _mm_add_ps(_mm_mul_ps(s0, w1), d01); + } + src += srcS; + } + if (tail == DF) + { + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params); + } + else + { + Term::template Save(dst + 0, d00, bias, params); + Term::template Save(dst + F, d01, bias, params, tail - F); + } + } + else + { + if (first) + d00 = _mm_setzero_ps(); + else + d00 = _mm_loadu_ps(dst + 0 * dstC + 0); + for (size_t c = 0; c < srcC; c += F) + { + size_t n = Simd::Min(F, srcC - c); + for (size_t i = 0; i < n; ++i, weight += DF) + { + w0 = _mm_loadu_ps(weight + 0); + s0 = _mm_set1_ps(src[i + 0 * F]); + d00 = _mm_add_ps(_mm_mul_ps(s0, w0), d00); + } + src += srcS; + } + if (tail == F) + Term::template Save(dst + 0, d00, bias, params); + else + Term::template Save(dst + 0, d00, bias, params, tail); + } + } + + template void OutputConvolution(const float* src, const SimdConvolutionParameters& p, + size_t srcC, size_t yBeg, size_t yEnd, const size_t bufH[2], const float* weight, const float* bias, const float* params, float* dst, int first) + { + assert(p.group == 1 && p.kernelY == 1 && p.strideY == 1); + size_t srcH = p.srcH, srcW = p.srcW, dstW = p.dstW, dstC = p.dstC; + size_t srcM = (bufH[1] - 1), srcS = bufH[1] * srcW * F; + size_t dstW3 = AlignLoAny(dstW, 3), dstW6 = AlignLoAny(dstW, 6); + __m128 _params[2], _bias[2]; + _params[0] = _mm_set1_ps(params[0]); + if (type == SimdConvolutionActivationRestrictRange || + type == SimdConvolutionActivationHswish || + type == SimdConvolutionActivationHardSigmoid) + _params[1] = _mm_set1_ps(params[1]); + + dst += yBeg * p.dstW * p.dstC; + size_t dc = 0; + for (; dc < dstC; dc += DF) + { + size_t tail = Simd::Min(DF, dstC - dc); + _bias[0] = _mm_loadu_ps(bias + dc + 0); + _bias[1] = _mm_loadu_ps(bias + dc + F); + if (type == ::SimdConvolutionActivationPrelu) + { + _params[0] = _mm_loadu_ps(params + dc + 0); + _params[1] = _mm_loadu_ps(params + dc + F); + } + float* pDst = dst + dc; + for (size_t y = yBeg; y < yEnd; ++y) + { + const float* pSrc = src + (y & srcM) * srcW * F; + size_t x = 0; + for (; x < dstW6; x += 6, pDst += 6 * dstC, pSrc += 6 * F) + OutputConvolution_2x6(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first); + if (dstW - dstW6 == 4) + OutputConvolution_2x4(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first), pDst += 4 * dstC; + else + { + for (; x < dstW3; x += 3, pDst += 3 * dstC, pSrc += 3 * F) + OutputConvolution_2x3(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first); + for (; x < dstW; ++x, pDst += dstC, pSrc += F) + OutputConvolution_2x1(pSrc, srcC, srcS, weight, _bias, _params, pDst, dstC, tail, first); + } + } + weight += srcC * DF; + } + } + + //------------------------------------------------------------------------------------------------------- + + template void SetOutput(const ConvParam& p, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution) + { + convolution[0] = OutputConvolution; + convolution[1] = OutputConvolution; + } + + void SetOutput(const ConvParam& p, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution) + { + switch (p.activation) + { + case SimdConvolutionActivationIdentity: SetOutput(p, convolution); break; + case SimdConvolutionActivationRelu: SetOutput(p, convolution); break; + case SimdConvolutionActivationLeakyRelu: SetOutput(p, convolution); break; + case SimdConvolutionActivationRestrictRange: SetOutput(p, convolution); break; + case SimdConvolutionActivationPrelu: SetOutput(p, convolution); break; + case SimdConvolutionActivationElu: SetOutput(p, convolution); break; + case SimdConvolutionActivationHswish: SetOutput(p, convolution); break; + case SimdConvolutionActivationMish: SetOutput(p, convolution); break; + case SimdConvolutionActivationHardSigmoid: SetOutput(p, convolution); break; + case SimdConvolutionActivationSwish: SetOutput(p, convolution); break; + case SimdConvolutionActivationGelu: SetOutput(p, convolution); break; + default: assert(0); + } + } + } +#endif +} diff --git a/src/Simd/SimdSynetMergedConvolution32f.h b/src/Simd/SimdSynetMergedConvolution32f.h index bc8d6cfb62..e0f3b146cf 100644 --- a/src/Simd/SimdSynetMergedConvolution32f.h +++ b/src/Simd/SimdSynetMergedConvolution32f.h @@ -186,13 +186,19 @@ namespace Simd #ifdef SIMD_SSE41_ENABLE namespace Sse41 { + void SetInput(const ConvParam& p, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution); + + void SetDepthwise(const ConvParam& p, bool last, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution); + + void SetOutput(const ConvParam& p, Base::SynetMergedConvolution32f::ConvolutionPtr* convolution); + + //------------------------------------------------------------------------------------------------- + class SynetMergedConvolution32fCdc : public Base::SynetMergedConvolution32fCdc { public: SynetMergedConvolution32fCdc(const MergConvParam& p); virtual String Ext() const { return "Sse41"; } - - static void Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32f::ConvolutionPtr* c); }; class SynetMergedConvolution32fCd : public Base::SynetMergedConvolution32fCd @@ -200,8 +206,6 @@ namespace Simd public: SynetMergedConvolution32fCd(const MergConvParam& p); virtual String Ext() const { return "Sse41"; } - - static void Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32f::ConvolutionPtr* c); }; class SynetMergedConvolution32fDc : public Base::SynetMergedConvolution32fDc @@ -209,8 +213,6 @@ namespace Simd public: SynetMergedConvolution32fDc(const MergConvParam& p); virtual String Ext() const { return "Sse41"; } - - static void Set(const MergConvParam& p, size_t t, size_t i, SynetMergedConvolution32f::ConvolutionPtr* c); }; //------------------------------------------------------------------------------------------------- diff --git a/src/Test/TestSynetMergedConvolution32f.cpp b/src/Test/TestSynetMergedConvolution32f.cpp index 890fa42357..97d13a7077 100644 --- a/src/Test/TestSynetMergedConvolution32f.cpp +++ b/src/Test/TestSynetMergedConvolution32f.cpp @@ -314,6 +314,7 @@ namespace Test result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 555, 40, 23), Cnv(a0, 3, 2), Cnv(a1, 1, 1, 1555)), f1, f2); result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 1091, 39, 39), Cnv(a1, 3, 1), Cnv(a2, 1, 1, 181)), f1, f2); result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 391, 39, 39), Cnv(a1, 1, 1, 1181), Cnv(a2, 3, 1)), f1, f2); + result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 391, 39, 39), Cnv(a1, 1, 1, 1024), Cnv(a2, 3, 1)), f1, f2); result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 768, 40, 40), Cnv(a0, 1, 1, 192), Cnv(a1, 3, 1), Cnv(a2, 1, 1, 256), f), f1, f2); result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 1024, 40, 40), Cnv(a0, 1, 1, 201), Cnv(a1, 5, 1), Cnv(a2, 1, 1, 191), f), f1, f2); result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 192, 60, 60), Cnv(a0, 1, 1, 384), Cnv(a1, 3, 1), Cnv(a2, 1, 1, 192), f), f1, f2); @@ -328,13 +329,7 @@ namespace Test //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 91, 41, 41), Cnv(a0, 1, 1, 191), Cnv(a1, 3, 1)), f1, f2); #endif #else - result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 391, 39, 39), Cnv(a1, 1, 1, 1181), Cnv(a2, 5, 1)), f1, f2); - //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 1035, 40, 40), Cnv(a0, 1, 1, 201), Cnv(a1, 5, 1), Cnv(a2, 1, 1, 191), f), f1, f2); - //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 1911, 40, 40), Cnv(a1, 3, 1), Cnv(a2, 1, 1, 64)), f1, f2); - //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 1910, 40, 40), Cnv(a1, 5, 1), Cnv(a2, 1, 1, 192)), f1, f2); - //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 1035, 40, 40), Cnv(a0, 1, 1, 200), Cnv(a1, 5, 1), Cnv(a2, 1, 1, 190), f), f1, f2); - //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 191, 39, 39), Cnv(a1, 5, 1), Cnv(a2, 1, 1, 181)), f1, f2); - //result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 91, 40, 40), Cnv(a0, 1, 1, 192), Cnv(a1, 3, 1)), f1, f2); + result = result && SynetMergedConvolution32fForwardAutoTest(eps, Param(Shp(1, 555, 40, 23), Cnv(a0, 3, 2), Cnv(a1, 1, 1, 1555)), f1, f2); #endif return result; }