diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d85f938b9..2f0cce18bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,11 @@ Full documentation for MIOpen is available [here](https://rocm.docs.amd.com/proj * [Conv] Enabled tuning through the `miopenSetConvolutionFindMode` API * [RNN] Added the new algorithm type `miopenRNNroundedDynamic` for LSTM * [TunaNet] Enabled NHWC for MI300 +* [BatchNorm] Enabled broad support for NHWC +* [BatchNorm] Enabled tuning through MIOPEN_FIND_ENFORCE ### Optimized * Updated KernelTuningNet for CK solvers +* NHWC Batchnorm ### Resolved issues diff --git a/CMakeLists.txt b/CMakeLists.txt index 0222db304a..3fd4b0775b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,7 @@ if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -rocm_setup_version(VERSION 3.4.0) +rocm_setup_version(VERSION 3.4.1) list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) include(TargetFlags) diff --git a/Jenkinsfile b/Jenkinsfile index 149035a575..37ace74aa6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -245,7 +245,11 @@ def buildHipClangJob(Map conf=[:]){ if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1" } - + def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') + def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') + dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " + echo "Docker flags: ${dockerOpts}" + def variant = env.STAGE_NAME def needs_gpu = conf.get("needs_gpu", true) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index bfa1b91aef..f7409d29b8 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -195,10 +195,8 @@ int BatchNormDriver::GetandSetData() SetBNParametersFromCmdLineArgs(); in.AllocOnHost(tensor{bn_layout, in_len}); - for(size_t i = 0; i < in.GetVector().size(); i++) - { - in.GetVector()[i] = 
prng::gen_canonical(); - } + // 0.0 to 2.0 + in.GetTensor().generate(uniform_unsigned_initializer(2e-3 /*scale*/, 1000 /*range*/)); auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); @@ -208,20 +206,25 @@ int BatchNormDriver::GetandSetData() out.AllocOnHost(tensor{bn_layout, in_len}); scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - - for(int i = 0; i < scale.GetVector().size(); i++) - { - scale.GetVector()[i] = prng::gen_canonical(); - bias.GetVector()[i] = prng::gen_canonical(); - } + // -2.0 to 2.0 + scale.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + bias.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } if(isFwdInfer) { estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - auto gen_value_emean = [](auto...) 
{ return prng::gen_descreet_unsigned(1e-2, 100); }; - estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); + // -2.0 to 2.0 + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + // estVaraince has to be +ve number otherwise 1/sqrt(-ve) would + // give img number + estVariance.GetTensor().generate( + uniform_unsigned_initializer(2e-3 /*scale*/, 1000 /*range*/)); } else if(isFwdTrain) { @@ -230,11 +233,11 @@ int BatchNormDriver::GetandSetData() runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - for(int i = 0; i < runVariance.GetVector().size(); i++) - { - runMean.GetVector()[i] = prng::gen_canonical(); - runVariance.GetVector()[i] = prng::gen_canonical(); - } + // -2.0 to 2.0 + runMean.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + runVariance.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } else if(isBwd) { @@ -242,33 +245,33 @@ int BatchNormDriver::GetandSetData() bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); - - auto gen_var_bwd = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - - dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + // -2.0 to 2.0 + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - auto gen_value = [](auto...) 
{ return prng::gen_descreet_unsigned(1e-2, 100); }; - bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - - auto gen_in_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_in_var); - savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); + bnScale.InitHostData( + bnScale.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + // -2.0 to 2.0 + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } else { std::cout << "\nUnknown batch norm state!\n"; exit(EXIT_FAILURE); } - return miopenStatusSuccess; } @@ -297,7 +300,7 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); - inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag("printconv", 'P', "1", "Print Convolution Dimensions (Default=1)", "int"); @@ -1364,11 +1367,6 @@ int BatchNormDriver::RunBackwardCPU() if(!back) return miopenStatusSuccess; - // T alphaDiff = 1, betaDiff = 0; - // T alphaParam = 1, betaParam = 0; - double alpha = static_cast(1), beta = static_cast(0), - gamma = static_cast(1); - // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); // float alphaParamDiff = static_cast(1), betaParamDiff = 
static_cast(0); int size{0}; @@ -1394,20 +1392,14 @@ int BatchNormDriver::RunBackwardCPU() if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - batchNormActivSpatialHostBwdTrain(activ_mode, - gamma, - beta, - alpha, - in.GetTensor(), - dy.GetTensor(), - out.GetTensor(), - out_ref, - bnScale.GetTensor(), - dBias.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); + batchNormPerActHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 diff --git a/fin b/fin index 344cf42f6c..25a7bffc6c 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 344cf42f6c18f309f3d1dd08af1cd7b73dd38e46 +Subproject commit 25a7bffc6c252bc5f5a4c05eba10f1c2930faa0f diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 78a4ef79ec..d69586adcb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -211,15 +211,13 @@ set( MIOpen_Source solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp - solver/batchnorm/backward_spatial_multiple.cpp - solver/batchnorm/backward_spatial_single.cpp + solver/batchnorm/backward_spatial.cpp solver/batchnorm/forward_inference.cpp solver/batchnorm/forward_inference_ck.cpp solver/batchnorm/forward_inference_fused.cpp solver/batchnorm/forward_per_activation.cpp solver/batchnorm/forward_per_activation_fused.cpp - solver/batchnorm/forward_spatial_multiple.cpp - solver/batchnorm/forward_spatial_single.cpp + solver/batchnorm/forward_spatial.cpp solver/batchnorm/forward_training_ck.cpp solver/cat/forward_cat.cpp solver/conv/conv_asm_1x1u.cpp diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index 2c5486f307..2d56ec2aa2 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ -73,21 +73,10 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& if(layout == 
miopenTensorNCDHW) { layout = miopenTensorNCHW; - - // NxCxDxHxW -> NxCx(D*H)xW - dims[2] *= dims[3]; - dims[3] = dims[4]; - dims.pop_back(); } else if(layout == miopenTensorNDHWC) { layout = miopenTensorNHWC; - - // NxDxHxWxC -> Nx(D*H)xWxC - dims[1] *= dims[2]; - dims[2] = dims[3]; - dims[3] = dims[4]; - dims.pop_back(); } else { @@ -95,6 +84,12 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) } + // Both NCDHW and NDHWC layout store the lens in NCHDW form + // hence : NxCxDxHxW -> NxCx(D*H)xW + dims[2] *= dims[3]; + dims[3] = dims[4]; + dims.pop_back(); + return {dataType, layout, dims}; } @@ -109,14 +104,11 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime) case 0: if(handle.IsProfilingEnabled()) { - *ctime = 0.; - handle.ResetKernelTime(); ktime = handle.GetKernelTime(); *ctime = ktime; #if(MIO_BN_CPP_PROF == 1) - printf("ktime0: %lf\n", ktime); - printf("ctime: %f\n", *ctime); + printf("kernel0: %7.3f ms total: %7.3f ms\n", ktime, *ctime); #endif } #if(MIOPEN_BN_SYNCH == 1) @@ -133,8 +125,7 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime) *ctime += ktime; #if(MIO_BN_CPP_PROF == 1) - printf("ktime1: %lf\n", ktime); - printf("ctime: %f\n", *ctime); + printf("kernel1: %7.3f ms total: %7.3f ms\n", ktime, *ctime); #endif } #if(MIOPEN_BN_SYNCH == 1) @@ -148,15 +139,12 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime) case 2: if(handle.IsProfilingEnabled()) { - -#if(MIO_BN_CPP_PROF == 1) ktime = handle.GetKernelTime(); + *ctime += ktime; + handle.ResetKernelTime(); handle.AccumKernelTime(*ctime); - printf("ktime2: %lf\n", ktime); - printf("ctime: %f\n", *ctime + ktime); -#else - handle.GetKernelTime(); - handle.AccumKernelTime(*ctime); +#if(MIO_BN_CPP_PROF == 1) + printf("kernel2: %7.3f ms total: %7.3f ms\n", ktime, *ctime); #endif } break; diff --git 
a/src/batchnorm/problem_description.cpp b/src/batchnorm/problem_description.cpp index daebf49208..2a5513bb24 100644 --- a/src/batchnorm/problem_description.cpp +++ b/src/batchnorm/problem_description.cpp @@ -30,8 +30,6 @@ #include #include -#define WORKAROUND_SWDEV_253606 1 - namespace miopen { namespace batchnorm { @@ -140,314 +138,49 @@ bool IsCKBwdTypeValid(const ProblemDescription& bn_problem) } NetworkConfig ProblemDescription::MakeNetworkConfig() const -{ - switch(direction) - { - case Direction::ForwardTraining: return MakeForwardTrainingNetworkConfig(); - case Direction::ForwardInference: return MakeForwardInferenceNetworkConfig(); - case Direction::Backward: return MakeBackwardNetworkConfig(); - default: MIOPEN_THROW(miopenStatusInternalError); - } -} - -NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const { std::ostringstream ss; int n, c, h, w; std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - const unsigned int in_cstride = h * w; - const unsigned int in_nhw = n * in_cstride; - - size_t xlocalsize = 1024; - if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) - xlocalsize = 256; - - size_t ylocalsize = 1; - - size_t xgridsize = c * xlocalsize; - size_t ygridsize = 1; - - bool bfpmixparm = false; - if(IsMix()) + int d = 1; + // dimensions + ss << c; + ss << "x" << d << "x" << h << "x" << w; + ss << "x" << n; + // layout + ss << "x" << ComputeInLayout(); + ss << "x" << ComputeOutLayout(); + if(direction == Direction::Backward) { - bfpmixparm = true; + ss << "x" << ComputeDinLayout(); } - - if(bn_mode == miopenBNSpatial) + // data type + ss << "x" << GetDataTypeName(xDesc.GetType()); + ss << "x" << GetDataTypeName(yOrDyDesc.GetType()); + ss << "x" << GetDataTypeName(scaleDesc.GetType()); + ss << "x" << GetDataTypeName(biasDesc.GetType()); + ss << "x" << GetDataTypeName(sMeanDesc.GetType()); + ss << "x" << GetDataTypeName(sVarianceDesc.GetType()); + if(direction == Direction::Backward) { - bool single = 
true; - int variant = 1; - unsigned int ldsgcn = xlocalsize / 64; - -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) - { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - } - else -#endif - - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - const auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - single = false; - ldsgcn = ylocalsize / 64; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && IsFp32()) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - const auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - single = false; - ldsgcn = ylocalsize / 64; - } - - ss << "variant" << variant; - -#if(WORKAROUND_SWDEV_253606 == 0) - if(variant == 4) - { - ss << "rs" << static_cast(resultsave); - ss << "rr" << static_cast(resultrunning); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "c" << c; - } - else -#endif - { - ss << "gx" << xgridsize; - ss << "gy" << ygridsize; - ss << "xl" << xlocalsize; - ss << "yl" << ylocalsize; - ss << "ldsgcn" << ldsgcn; - ss << "rs" << static_cast(resultsave); - ss << "rr" << static_cast(resultrunning); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "single" << static_cast(single); - ss << "n" << n; - ss << "c" << c; - 
ss << "hw" << in_cstride; - } - } - else - { - xlocalsize = 1; - ylocalsize = 256; - const std::size_t segment = (in_cstride + ylocalsize - 1) / ylocalsize; - xgridsize = c; - ygridsize = segment * ylocalsize; - - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "gx" << xgridsize; - ss << "gy" << ygridsize; - ss << "lx" << xlocalsize; - ss << "ly" << ylocalsize; - ss << "rs" << static_cast(resultsave); - ss << "rr" << static_cast(resultrunning); - ss << "segment" << segment; - ss << "n" << n; - ss << "c" << c; - ss << "hw" << in_cstride; + ss << "x" << GetDataTypeName(dxDesc.GetType()); } - ss << "layout" << in_layout; - ss << "scaleType" << static_cast(IsScaleFp16()); - ss << "scaleType" << static_cast(IsScaleFp32()); - - return NetworkConfig{ss.str()}; -} + ss << "x" << IsMix(); -NetworkConfig ProblemDescription::MakeForwardInferenceNetworkConfig() const -{ - std::ostringstream ss; - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - const unsigned int in_cstride = h * w; - - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "mode" << bn_mode; - ss << "HWdims" << in_cstride; - ss << "C" << c; - ss << "layout" << in_layout; - ss << "scaleType" << static_cast(IsScaleFp16()); - ss << "scaleType" << static_cast(IsScaleFp32()); - - return NetworkConfig{ss.str()}; -} - -NetworkConfig ProblemDescription::MakeBackwardNetworkConfig() const -{ - std::ostringstream ss; - - bool bfpmixparm = false; - if(xDesc.GetType() == miopenHalf && GetBnScale().GetType() == miopenFloat) + // direction + ss << "x" << GetDirectionStr(); + // save and running + if(direction == Direction::ForwardTraining) { - bfpmixparm = true; + ss << "x" << 
resultsave; + ss << "x" << resultrunning; } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - const unsigned int in_cstride = h * w; - const unsigned int in_nhw = n * in_cstride; - - size_t xlocalsize = 1; - size_t ylocalsize = 1; - - size_t xgridsize = 1; - size_t ygridsize = 1; - - if(bn_mode == miopenBNSpatial) - { - unsigned int ldsgcn = 0; - bool single = true; - int variant = 1; - - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - else if(in_cstride <= 512) - { - if((n > 64) && (in_cstride > 160)) - { - variant = 3; - xlocalsize = - std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - else - { - variant = 0; - if(IsFp32()) - { - xlocalsize = 1024; - xgridsize = 1024 * static_cast(c); - } - else - { - xlocalsize = 256; - xgridsize = 256 * static_cast(c); - } - ldsgcn = xlocalsize / 64; - } - } - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - single = false; - ldsgcn = ylocalsize / 64; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - - ss << "variant" << variant; - ss << "gx" << xgridsize; - ss << "n" << n; - ss << "c" << c; - ss << "hw" << in_cstride; - ss << "gy" << ygridsize; - ss << "lx" << xlocalsize; - ss << "ly" << ylocalsize; - ss << "us" << static_cast(useSaved); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << 
static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "single" << static_cast(single); - ss << "gcn" << ldsgcn; - } - else + if(direction == Direction::Backward) { - ylocalsize = (64 >= in_cstride) ? 64 : 256; - const unsigned int segment = std::ceil(double(in_cstride) / double(ylocalsize)); - xgridsize = c; - ygridsize = segment * ylocalsize; - - ss << "gx" << xgridsize; - ss << "gy" << ygridsize; - ss << "lx" << xlocalsize; - ss << "ly" << ylocalsize; - ss << "n" << n; - ss << "c" << c; - ss << "hw" << in_cstride; - ss << "u" << static_cast(useSaved); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "nhw" << in_nhw; + ss << "x" << useSaved; } - ss << "layout" << in_layout; - ss << "scaleType" << static_cast(IsScaleFp16()); - ss << "scaleType" << static_cast(IsScaleFp32()); + ss << "x" << GetModeStr(); return NetworkConfig{ss.str()}; } diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index b730325add..6f079e01f9 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -156,11 +156,11 @@ void BnDriverInfo(std::stringstream& ss, } if((resultRunningMean != nullptr) && (resultRunningVariance != nullptr)) { - ss << " -s 1"; + ss << " -r 1"; } if((resultSaveMean != nullptr) && (resultSaveInvVariance != nullptr)) { - ss << " -r 1"; + ss << " -s 1"; } } diff --git a/src/fin/fin_interface.cpp b/src/fin/fin_interface.cpp index d321a687a0..f704461a89 100644 --- a/src/fin/fin_interface.cpp +++ b/src/fin/fin_interface.cpp @@ -354,6 +354,8 @@ AnySolver::AnyS switch(id) { + case 113: SetObject(); break; + case 117: SetObject(); break; case 142: SetObject(); break; case 143: SetObject(); break; case 144: SetObject(); break; diff --git a/src/include/miopen/batchnorm/common_spatial.hpp 
b/src/include/miopen/batchnorm/common_spatial.hpp new file mode 100644 index 0000000000..c95ef8b4f1 --- /dev/null +++ b/src/include/miopen/batchnorm/common_spatial.hpp @@ -0,0 +1,453 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include + +#define WORKAROUND_SWDEV_253606 1 + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +inline void GetWGSizeNHWC(size_t c, + size_t h, + size_t w, + size_t min_workgroups, + bool bfp32parm, + size_t vectorsize, + size_t& xlocalsize, + size_t& ylocalsize) +{ + unsigned int xlocalsize_limit = vectorsize > 1 ? (bfp32parm ? 
16 : 32) : 64; + // shared memory size per workgroup is fixed + unsigned int max_localsize = 1024 / vectorsize; + + size_t nworkgroups = 0; + xlocalsize = 0; + // decrease max_localsize until the number of workgroups is greater than 80% + // of the available CUs + while(nworkgroups < min_workgroups && max_localsize >= xlocalsize_limit) + { + // xlocalsize must be power of 2 as reductions in the kernels rely on it, here c is rounded + // up to next power of 2. + xlocalsize = std::min(size_t{1 << int(std::ceil(std::log2(c / vectorsize)))}, + size_t{xlocalsize_limit}); + ylocalsize = max_localsize / xlocalsize; + nworkgroups = ((c / vectorsize + xlocalsize - 1) / xlocalsize) * + ((h * w + ylocalsize - 1) / ylocalsize); + max_localsize >>= 1; + } +} + +inline int GetStashMethod(bool IsLayoutNHWC, + miopenDataType_t problem_type, + unsigned int stash_values, + size_t c, + size_t n, + size_t in_cstride, + unsigned int ylocalsize) +{ + // See `batchnorm_functions.hpp` for stash implementation of different methods + int stash_method = 0; + stash_values *= (problem_type == miopenFloat ? 1 : 2); + unsigned int last_ylocalsize = + (in_cstride) % ylocalsize == 0 ? ylocalsize : (in_cstride) % ylocalsize; + if(last_ylocalsize < stash_values && n >= (size_t)stash_values) + { + stash_method = 1; + } + if(IsLayoutNHWC && !(problem_type == miopenFloat) && (c % 2 != 0) && (n >= stash_values)) + { + stash_method = 2; + } + return stash_method; +} + +// Returns true if spatial multiple is applicable and fill NHWC configuration +// (xlocalsize, ylocalsize). 
+// First workgroup size is computed given a problem and vectorsize, then it checks +// if the computed workgroup is applicable (spatial multiple restrictions) +inline bool GetLocalConfigNHWC(const miopen::batchnorm::ProblemDescription& problem, + unsigned int stash_values, + size_t vectorsize, + size_t& xlocalsize, + size_t& ylocalsize) +{ + bool bfp32parm = + problem.GetXDesc().GetType() == miopenHalf || problem.GetXDesc().GetType() == miopenBFloat16 + ? false + : true; + + size_t n, c, h, w = 0; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + assert((n != 0) && "n cannot be 0"); + assert((c != 0) && "c cannot be 0"); + assert((h != 0) && "h cannot be 0"); + assert((w != 0) && "w cannot be 0"); + + GetWGSizeNHWC( + c, h, w, problem.GetMinWorkgroups(), bfp32parm, vectorsize, xlocalsize, ylocalsize); + assert((xlocalsize != 0) && "xlocalsize cannot be 0"); + assert((ylocalsize != 0) && "ylocalsize cannot be 0"); + if(ylocalsize == 0) + { + ylocalsize = 1; + } + stash_values *= (bfp32parm ? 1 : 2); + unsigned int last_ylocalsize = (h * w) % ylocalsize == 0 ? ylocalsize : (h * w) % ylocalsize; + // FP32: + // - last block must have enough space to stash intermediate results in HW dimension + // - if last block doesn't fit, intermediate results are stored in N dimension which must + // be large enough + // Mix precision: + // - last block must have enough space to stash intermediate results in HW dimension + // - if last block doesn't fit, intermediate results are stored in N dimension which must + // be large enough + // - if C is not multiple of 2, intermediate results are stored in N dimension splitting + // float values in group of 2 bytes. 
N must be large enough + if((!bfp32parm && (c % 2 != 0 && n < (size_t)stash_values)) || + ((last_ylocalsize < stash_values) && (n < (size_t)stash_values))) + { + return false; + } + + return true; +} + +inline bool IsSpatialMultipleApplicable(const miopen::batchnorm::ProblemDescription& problem, + size_t vectorsize, + unsigned int stash_values) +{ + int n, c, h, w = 0; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + assert((n != 0) && "n cannot be 0"); + assert((c != 0) && "c cannot be 0"); + assert((h != 0) && "h cannot be 0"); + assert((w != 0) && "w cannot be 0"); + + unsigned int in_cstride = h * w; + + if(problem.IsLayoutNHWC()) + { + // check if the provided vectorsize can be used + if(c % vectorsize != 0) + { + return false; + } + // Variant 2 is the primary choice for NHWC + size_t xlocalsize, ylocalsize = 0; + + // The configuration is ignored at this point, it was just computed to check + // if spatial multiple could be applied. + return GetLocalConfigNHWC(problem, stash_values, vectorsize, xlocalsize, ylocalsize); + } + else + { + // check if the provided vectorsize can be used + if(in_cstride % vectorsize != 0) + { + return false; + } + + unsigned int ylocalsize = 1024; + unsigned int last_ylocalsize = + in_cstride % ylocalsize == 0 ? ylocalsize : in_cstride % ylocalsize; + // Restrictions: + // - last block must have enough space to stash intermediate results in HW dimension + // - if last block doesn't fit, intermediate results are stored in N dimension which must + // be large enough + stash_values *= (problem.GetXDesc().GetType() == miopenFloat ? 
1 : 2); + if(last_ylocalsize < stash_values && n < (size_t)stash_values) + { + return false; + } + } + return true; +} + +inline void GetSpatialMultipleConfig(const miopen::batchnorm::ProblemDescription& problem, + unsigned int stash_values, + size_t vectorsize, + size_t& xlocalsize, + size_t& ylocalsize, + size_t& xgridsize, + size_t& ygridsize, + int& stash_method) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + unsigned int in_cstride = h * w; + + if(problem.IsLayoutNHWC()) + { + // The function returns if the method is valid but we can ignore it + // at this point + GetLocalConfigNHWC(problem, stash_values, vectorsize, xlocalsize, ylocalsize); + + xgridsize = xlocalsize * ((c / vectorsize + xlocalsize - 1) / xlocalsize); + ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize); + } + else + { + xlocalsize = 1; + xgridsize = c; + ylocalsize = 1024; + if(ylocalsize > in_cstride / vectorsize) + { + // No need to use workgroups larger than the HW dimension + ylocalsize = std::max(size_t{64}, + size_t{1 << int(std::ceil(std::log2(in_cstride / vectorsize)))}); + } + ygridsize = ylocalsize * ((in_cstride / vectorsize + ylocalsize - 1) / ylocalsize); + } + stash_method = GetStashMethod(problem.IsLayoutNHWC(), + problem.GetXDesc().GetType(), + stash_values, + c, + n, + in_cstride, + ylocalsize); +} + +inline void GetVariantFromKernelId(const std::string& kernel_id, int& variant, size_t& vectorsize) +{ + // kernel_id has the following standard: + // Variant- + size_t pos = kernel_id.find("Variant"); + if(pos != std::string::npos) + { + variant = kernel_id[pos + 7] - '0'; + vectorsize = kernel_id[pos + 9] - '0'; + } +} + +inline std::string GetKernelIdFromVariant(int variant, size_t vectorsize) +{ + std::stringstream stream; + stream << "Variant" << variant << "-" << vectorsize; + return stream.str(); +} + +inline bool UseMultiple(const miopen::batchnorm::ProblemDescription& problem) +{ + size_t n, c, h, w; + 
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + bool bfpmixparm = (problem.GetXDesc().GetType() == miopenHalf || + problem.GetXDesc().GetType() == miopenBFloat16) && + problem.GetBnScale().GetType() == miopenFloat + ? true + : false; + + unsigned int in_cstride = h * w; + unsigned int in_nhw = n * in_cstride; + // Check heuristics (used to choose between spatial single and multiple for performance) + // TODO: review these conditions (variant 2 was optimized and vectorization was added, + // so we need a set of benchmarks to check that these conditions are still correct) + if(!problem.IsLayoutNHWC() && + problem.GetDirection() == miopen::batchnorm::Direction::Backward && + (!((in_nhw >= static_cast(32 * 1024 * 1024) || in_cstride <= 1024) && + (in_nhw >= static_cast(32 * 1024 * 1024) || in_cstride <= 512) && + in_cstride > 512))) + { + return false; + } + + if(!problem.IsLayoutNHWC() && + problem.GetDirection() == miopen::batchnorm::Direction::ForwardTraining && + (!((n >= 3 && in_cstride > 512 && (in_nhw >= 33554432 || in_cstride <= 1024) && + ((n < 256) || (in_cstride <= 60) || !bfpmixparm) && + (!bfpmixparm || in_cstride <= 512)) || + ((n > 768) && (in_cstride > 150))))) + { + return false; + } + + return true; +} + +inline void DefaultConfigSpatialSingle(const miopen::batchnorm::ProblemDescription& problem, + std::vector& valid_kernels) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nhw = n * in_cstride; + + bool bfpmixparm = + problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenFloat + ? true + : false; + + bool bbfpmixparam = problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat + ? true + : false; + + // NCHW supports also variants 0 and 3 which can be much faster than + // variant 1 but have more restrictions. 
Here we decide if we use variant + // 0, 1, 3 + // In case variant 0 or 3 are selected, we add also variant 1 for tuning. + // Almost always variant 0 and 3 will be faster than variant 1 but + // we add the latter for tuning to be sure and because it is cheap + if(!problem.IsLayoutNHWC()) + { + if(problem.GetDirection() == miopen::batchnorm::Direction::Backward) + { + if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + + // N*H*W < 32M and H*W > 1024 + // use batchnorm variant#1 implementation which parallelize + // work groups over channels and loop through NHW. + if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + // N*H*W < 32M and H*W > 512 + // use batchnorm variant#1 or variant#3 implementation which + // parallelize work groups over channels and loop through N. + else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) + { + if(n >= 32) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + else + { + valid_kernels.push_back(GetKernelIdFromVariant(3, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + } + // H*W < 512 use batchnorm variant#0 or variant#3 implementation + // based on batch size and H*W + else if(in_cstride <= 512) + { + if((n > 64) && (in_cstride > 160)) + { + valid_kernels.push_back(GetKernelIdFromVariant(3, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + else + { + valid_kernels.push_back(GetKernelIdFromVariant(0, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + } + } + else + { +#if(WORKAROUND_SWDEV_253606 == 0) + if(n < 3) + { + valid_kernels.push_back(GetKernelIdFromVariant(4, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + else +#endif + { + // clang-format off + if(in_cstride > 512 && in_cstride <= 1024 && n < 32) + { + 
valid_kernels.push_back(GetKernelIdFromVariant(3, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + + if( (in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && (bfpmixparm || bbfpmixparam)) || + ((in_cstride > 512) && (bfpmixparm || bbfpmixparam))) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + if(in_cstride <= 512) + { + valid_kernels.push_back(GetKernelIdFromVariant(0, 1)); + } + return; + } + else if(in_cstride <= 512) + { + valid_kernels.push_back(GetKernelIdFromVariant(0, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + // clang-format on + } + } + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + } +} + +inline void DefaultConfigSpatialMultiple(const miopen::batchnorm::ProblemDescription& problem, + unsigned int stash_values, + std::vector& valid_kernels) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + unsigned int in_cstride = h * w; + + // Default configuration for spatial multiple tries to use vectorization + // for both NCHW or NHWC + size_t vectorsize = + problem.IsLayoutNHWC() ? (c % 4 == 0 ? 4 : 1) : (in_cstride % 4 == 0 ? 
4 : 1); + if(IsSpatialMultipleApplicable(problem, vectorsize, stash_values)) + { + valid_kernels.push_back(GetKernelIdFromVariant(2, vectorsize)); + // if vectorized version is applicable, then the non vectorized version + // is also added to the list of configurations + if(vectorsize > 1) + { + valid_kernels.push_back(GetKernelIdFromVariant(2, 1)); + } + return; + } + + // If spatial multiple with vectorization can not be used, try without vectorization + if(vectorsize > 1 && IsSpatialMultipleApplicable(problem, 1, stash_values)) + { + valid_kernels.push_back(GetKernelIdFromVariant(2, 1)); + } +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index 4fed9033a6..50d026ea71 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -73,7 +73,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, double expAvgFactor_, double epsilon_, bool resultsave_, - bool resultrunning_) + bool resultrunning_, + size_t min_workgroups_) : direction(Direction::ForwardTraining), bn_mode(bn_mode_), xDesc(xDesc_), @@ -85,7 +86,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, expAvgFactor(expAvgFactor_), epsilon(epsilon_), resultsave(resultsave_), - resultrunning(resultrunning_) + resultrunning(resultrunning_), + min_workgroups(min_workgroups_) { SetSpatialDims(); in_layout = ComputeInLayout(); @@ -126,7 +128,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, const TensorDescriptor& sMeanDesc_, const TensorDescriptor& sVarianceDesc_, double epsilon_, - bool useSaved_) + bool useSaved_, + size_t min_workgroups_) : direction(Direction::Backward), bn_mode(bn_mode_), xDesc(xDesc_), @@ -137,7 +140,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, 
sMeanDesc(sMeanDesc_), sVarianceDesc(sVarianceDesc_), epsilon(epsilon_), - useSaved(useSaved_) + useSaved(useSaved_), + min_workgroups(min_workgroups_) { SetSpatialDims(); in_layout = ComputeInLayout(); @@ -196,6 +200,12 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, return resultrunning; } + std::size_t GetMinWorkgroups() const + { + assert(direction == Direction::ForwardTraining || direction == Direction::Backward); + return min_workgroups; + } + bool UseSaved() const { assert(direction == Direction::Backward); @@ -307,17 +317,14 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, #pragma clang diagnostic pop #endif - bool resultsave = false; - bool resultrunning = false; - bool useSaved = false; - std::string in_layout = "NCHW"; - std::string out_layout = "NCHW"; - std::string din_layout = "NCHW"; - std::size_t spatial_dim = 2; - - NetworkConfig MakeForwardTrainingNetworkConfig() const; - NetworkConfig MakeForwardInferenceNetworkConfig() const; - NetworkConfig MakeBackwardNetworkConfig() const; + bool resultsave = false; + bool resultrunning = false; + bool useSaved = false; + std::string in_layout = "NCHW"; + std::string out_layout = "NCHW"; + std::string din_layout = "NCHW"; + std::size_t spatial_dim = 2; + std::size_t min_workgroups = 1; std::string ComputeLayout(const TensorDescriptor& td) const { return td.GetLayout_str(); } std::string ComputeInLayout() const { return ComputeLayout(xDesc); } diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 7b8696b95d..13a9a52387 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -49,11 +49,11 @@ using BatchNormTunableSolver = TunableSolverMixin; ; -struct BnFwdTrainingSpatialSingle final : BatchnormSolver +struct BnFwdTrainingPerActivation final : BatchnormSolver { const std::string& SolverDbId() const override { - return GetSolverDbId(); + return 
GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, @@ -62,56 +62,106 @@ struct BnFwdTrainingSpatialSingle final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; -struct BnFwdTrainingSpatialMultiple final : BatchnormSolver +struct PerformanceConfigBnBwdBackward : PerfConfigBase { - const std::string& SolverDbId() const override + int index; + std::string kernel_id; + std::vector valid_kernels; + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnBwdBackward(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id) { - return GetSolverDbId(); } + PerformanceConfigBnBwdBackward() : PerformanceConfigBnBwdBackward(0, "") {} + PerformanceConfigBnBwdBackward(bool) : PerformanceConfigBnBwdBackward(0, "") {} + MIOPEN_INTERNALS_EXPORT void + HeuristicInit(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; + MIOPEN_INTERNALS_EXPORT bool + IsValid(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const; - bool IsApplicable(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigBnBwdBackward& other) const; }; -struct BnFwdTrainingPerActivation final : BatchnormSolver +struct PerformanceConfigBnFwdTraining : PerfConfigBase { - const std::string& SolverDbId() const override + int index; + std::string kernel_id; + std::vector valid_kernels; + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnFwdTraining(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id) { - return GetSolverDbId(); } + 
PerformanceConfigBnFwdTraining() : PerformanceConfigBnFwdTraining(0, "") {} + PerformanceConfigBnFwdTraining(bool) : PerformanceConfigBnFwdTraining(0, "") {} + MIOPEN_INTERNALS_EXPORT void + HeuristicInit(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; + MIOPEN_INTERNALS_EXPORT bool + IsValid(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const; - bool IsApplicable(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigBnFwdTraining& other) const; }; -struct BnBwdTrainingSpatialSingle final : BatchnormSolver +struct BnBwdTrainingSpatial final : BatchNormTunableSolver { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnBwdBackward GetDefaultPerformanceConfig( + const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT bool + IsValidPerformanceConfig(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnBwdBackward& config) const override; + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnBwdBackward + Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const override; bool IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const override; 
ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnBwdBackward& config) const override; }; -struct BnBwdTrainingSpatialMultiple final : BatchnormSolver +struct BnFwdTrainingSpatial final : BatchNormTunableSolver { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnFwdTraining GetDefaultPerformanceConfig( + const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT bool + IsValidPerformanceConfig(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnFwdTraining& config) const override; + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnFwdTraining + Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const override; bool IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const override; ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnFwdTraining& config) const override; }; struct BnBwdTrainingPerActivation final : BatchnormSolver diff --git a/src/include/miopen/fusion/problem_description.hpp b/src/include/miopen/fusion/problem_description.hpp index b3d1669cee..fc485300ce 100644 --- a/src/include/miopen/fusion/problem_description.hpp +++ b/src/include/miopen/fusion/problem_description.hpp @@ -153,7 +153,8 @@ struct FusionDescription : ProblemDescriptionBase not_used, // expAvgFactor filler not_used, true /* resultSave*/, - 
bn_op.runningMeanVar}; + bn_op.runningMeanVar, + 1}; /* min number of workgroups */ } else if(dir == miopen::batchnorm::Direction::Backward) { @@ -170,7 +171,8 @@ struct FusionDescription : ProblemDescriptionBase {} /*bn_op.base_desc*/, {} /*bn_op.base_desc*/, not_used, - bn_op.useBatchStats /*useSaved*/}; + bn_op.useBatchStats /*useSaved*/, + 1}; /*min number of workgroups */ } else MIOPEN_THROW(miopenStatusNotImplemented); diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index 27800f15b0..91d055571b 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -312,12 +312,13 @@ ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem, const auto& data_ctx = primitive_parameters.CastTo(); auto argument_ptr = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx); auto invoker_ptr = sh_conv_ptr->MakeInvokerPointer(); - - const auto enable_profiling = handle.IsProfilingEnabled(); - float elapsed_time = - invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); - if(enable_profiling) { + WorkAroundHipEventProfiler prf(handle); + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), false}); + } + if(handle.IsProfilingEnabled()) + { + float elapsed_time = handle.GetKernelTime(); handle.ResetKernelTime(); handle.AccumKernelTime(elapsed_time); } diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index a018d35640..7465ff547f 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -638,300 +638,451 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void MIOpenBatchNormBwdSpatialFinalMeanVariance(__global _FLOAT* __restrict meanvarbuff, - _FLOAT INHW, + _FLOAT_PREC INHW, double epsilon) { - _FLOAT variance = (_FLOAT)0.; - 
_FLOAT invVariance = (_FLOAT)0.; - _FLOAT mean = (_FLOAT)0.; - unsigned int lid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); - unsigned int xgid = get_global_id(0); - unsigned int ygrp_sz = get_local_size(1); - unsigned int yngrps = get_num_groups(1); - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - unsigned int commitID = 0; - - for(int gn = 0; gn < yngrps; gn++) + + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int xgid = get_global_id(0); + unsigned int xgrp_sz = get_local_size(0); + unsigned int ygrp_sz = get_local_size(1); + + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C invVariance; + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) { - unsigned int offset = gn * ygrp_sz + lid; - unsigned int meanindex = cidx + ygrp_sz * offset; - unsigned int varindex = cidx + ygrp_sz * offset + 2; - if(offset < yngrps) - { // modify to span larger number of groups - mean += *(meanvarbuff + meanindex); - variance += *(meanvarbuff + varindex); // load per group variance - } + mean += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + variance += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDS_SIZE]; - lds_reduce2(&mean, &variance, FLOAT2ACCUM(INHW), lcl_data_x, lcl_data_y, lid); +#if 
!MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + // TODO: this simple approach has many bank conflicts, optimize if it affects performance + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + INHW, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - commitID = 64; - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&mean, &variance, FLOAT2ACCUM(INHW), lcl_data_x, lcl_data_y, lid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&mean, &variance, INHW, lcl_data_x, lcl_data_y, ylid); #endif - barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); - variance = mad(-mean, mean, variance); - if(variance < 0) - { - variance = 0; - } - invVariance = rsqrt(variance + epsilon); - if(lid == commitID) + variance = mad(-mean, mean, variance); + variance = max(variance, (_FLOAT_PREC_C)0.); + invVariance = rsqrt(variance + (_FLOAT_PREC_C)epsilon); + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) { - meanvarbuff[meanstashindex] = mean; // stash mean - meanvarbuff[varstashindex] = invVariance; // stash mean + // Replicate mean and variance for all y groups because stash == dx_out and + // MIOpenBatchNormBwdSpatialDX will read them and rewrite the buffer entirely. 
+ storeToStash(mean, + (__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(invVariance, + (__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void MIOpenBatchNormBwdSpatialMeanVariance(const __global _FLOAT* __restrict in, - __global _FLOAT* __restrict mvbuff) + __global _FLOAT* __restrict meanvarbuff) { + unsigned int xlid = get_local_id(0); unsigned int ylid = get_local_id(1); + unsigned int xgrp_id = get_group_id(0); unsigned int ygrp_id = get_group_id(1); unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); + + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanindex = cidx + ygrp_sz * ygrp_id; - unsigned int varindex = meanindex + 2; - _FLOAT mean = (_FLOAT)0.; - _FLOAT variance = (_FLOAT)0.; - _FLOAT value = (_FLOAT)0.; - - if(ygid < MIO_BN_HW) - { + _FLOAT_PREC_LS value; + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + if(ygid * VEC_SIZE_Y < MIO_BN_HW) + { + _FLOAT_LS read4; for(unsigned int n = 0; n < MIO_BN_N; n++) { - index = n * MIO_BN_CHW + cidx + ygid; - value = *(in + index); - mean += value; - variance = mad(value, value, variance); + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const __global _FLOAT_LS*)(in + index)); + value = FLOAT2FLOATPREC_VEC(read4); + _ACCUMULATE(mean, value) + _ACCUMULATE_MAD(variance, value, value, variance) } } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_NGRPS]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_NGRPS]; - lds_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; gcn_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif if(ylid == 0) { - mvbuff[meanindex] = mean; - mvbuff[varindex] = variance; + storeToStash(mean, + (__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + 
storeToStash(variance, + (__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } } // end spatial mean kernel #endif // end USESAVED == 0 __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormBwdSpatialDScaleDBias(const __global _FLOAT* x_in, - const __global _FLOAT* dy_in, - __global _FLOAT* buff -#if(MIO_BN_USESAVED == 1) - +MIOpenBatchNormBwdSpatialDScaleDBias(const __global _FLOAT* __restrict x_in, + const __global _FLOAT* __restrict dy_in, + __global _FLOAT* __restrict buff +#if MIO_BN_USESAVED == 1 , - const __global _FLOAT* savedMean, - const __global _FLOAT* savedInvVariance + const __global _FLOAT_PREC* __restrict savedMean, + const __global _FLOAT_PREC* __restrict savedInvVariance #endif ) { - unsigned int xgid = get_global_id(0); + unsigned int xlid = get_local_id(0); unsigned int ylid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); + unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int ygrp_id = get_group_id(1); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); - unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - _FLOAT mean = (_FLOAT)0.; - _FLOAT invVar = (_FLOAT)0.; - _FLOAT elemStd = (_FLOAT)0.; - _FLOAT xhat = (_FLOAT)0.; - _FLOAT dscale = (_FLOAT)0.; - _FLOAT dbias = (_FLOAT)0.; + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; - __local _FLOAT lmean, livar; + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + unsigned int index; + _FLOAT_PREC_C mean, invVar; + _FLOAT_PREC_LS elemStd, xhat; + _FLOAT_PREC_C dscale = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C dbias = (_FLOAT_PREC_C)0.; + + local _FLOAT_PREC_C lmean[MIO_BN_GRP0], livar[MIO_BN_GRP0]; if(ylid == 0) { -#if(MIO_BN_USESAVED == 0) - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - lmean = *(buff + meanstashindex); // load stashed mean - livar = *(buff + varstashindex); -#else // NO SAVED - lmean = *(savedMean + xgid); - livar = *(savedInvVariance + xgid); -#endif // SAVED +#if MIO_BN_USESAVED == 0 + lmean[xlid] = loadFromStash((__global _FLOAT_C*)buff, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + livar[xlid] = loadFromStash((__global _FLOAT_C*)buff, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); +#else + lmean[xlid] = *((__global _FLOAT_PREC_C*)(savedMean + xgid * VEC_SIZE_X)); + livar[xlid] = *((__global _FLOAT_PREC_C*)(savedInvVariance + xgid * VEC_SIZE_X)); +#endif } barrier(CLK_LOCAL_MEM_FENCE); - if(ygid < MIO_BN_HW) + if(ygid * VEC_SIZE_Y < MIO_BN_HW) { - mean = lmean; - invVar = livar; + mean = lmean[xlid]; + invVar = livar[xlid]; + _FLOAT_LS read4; + _FLOAT_PREC_LS value1, value2; for(unsigned int n = 0; n < MIO_BN_N; n++) { - index = n * MIO_BN_CHW + cidx + ygid; - dbias += *(dy_in + index); - elemStd = *(x_in + index) - mean; + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const __global _FLOAT_LS*)(dy_in + index)); + value1 = FLOAT2FLOATPREC_VEC(read4); + _ACCUMULATE(dbias, value1) + read4 = *((const __global _FLOAT_LS*)(x_in + index)); + value2 = FLOAT2FLOATPREC_VEC(read4); + elemStd = value2 - mean; xhat = elemStd * invVar; - dscale = mad(xhat, dy_in[index], dscale); + 
_ACCUMULATE_MAD(dscale, xhat, value1, dscale) } } -// REDUCE over DS and DB -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y2[MIO_BN_LDS_SIZE]; - lds_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, ylid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&dscale, + &dbias, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y2[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, ylid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif - // end reduction----------- if(ylid == 0) { - unsigned int betaindex = cidx + ygrp_sz * ygrp_id + 6; - unsigned int gammaindex = cidx + ygrp_sz * ygrp_id + 4; - buff[gammaindex] = FLOAT2FLOATPREC(dscale); - buff[betaindex] = FLOAT2FLOATPREC(dbias); + storeToStash(dscale, + (__global _FLOAT_C*)buff, + 2, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(dbias, + (__global _FLOAT_C*)buff, + 3, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormBwdSpatialFinalDScaleDBias(__global _FLOAT* buff, - __global _FLOAT* delta_scale, - __global _FLOAT* delta_bias) +MIOpenBatchNormBwdSpatialFinalDScaleDBias(const __global _FLOAT* __restrict buff, + __global _FLOAT_PREC* __restrict delta_scale, + __global _FLOAT_PREC* __restrict delta_bias) { - _FLOAT ds = (_FLOAT)0.; - _FLOAT db = 
(_FLOAT)0.; - - unsigned int lid = get_local_id(1); + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); unsigned int xgid = get_global_id(0); - unsigned int ygid = get_global_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); - unsigned int yngrps = get_num_groups(1); - int cidx = MIO_BN_HW * xgid; - for(int gn = 0; gn < MIO_BN_NGRPS; gn++) + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + _FLOAT_PREC_C dscale = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C dbias = (_FLOAT_PREC_C)0.; + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) { - unsigned int offset = gn * ygrp_sz + lid; - if(offset < yngrps) - { // modify to span larger number of groups - unsigned int gammaindex = cidx + ygrp_sz * offset + 4; - unsigned int betaindex = cidx + ygrp_sz * offset + 6; - ds += *(buff + gammaindex); - db += *(buff + betaindex); - } + dscale += loadFromStash((__global _FLOAT_C*)buff, + 2, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + dbias += loadFromStash((__global _FLOAT_C*)buff, + 3, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_NGRPS]; - local _FLOAT_ACCUM lcl_data_y2[MIO_BN_NGRPS]; - lds_reduce2(&ds, &db, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, lid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&dscale, + &dbias, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM 
lcl_data_y2[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&ds, &db, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, lid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif - if(ygid == 0) + if(ylid == 0) { - delta_scale[xgid] = FLOAT2FLOATPREC(ds); - delta_bias[xgid] = FLOAT2FLOATPREC(db); + *((__global _FLOAT_PREC_C*)(delta_scale + xgid * VEC_SIZE_X)) = dscale; + *((__global _FLOAT_PREC_C*)(delta_bias + xgid * VEC_SIZE_X)) = dbias; } } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormBwdSpatialDX(const __global _FLOAT* x_in, - const __global _FLOAT* dy_in, - __global _FLOAT* dx_out, - const __global _FLOAT* bnScale, - __global _FLOAT* delta_scale, - __global _FLOAT* delta_bias, -#if(MIO_BN_USESAVED == 1) - const __global _FLOAT* savedMean, - const __global _FLOAT* savedInvVariance, +MIOpenBatchNormBwdSpatialDX(const __global _FLOAT* __restrict x_in, + const __global _FLOAT* __restrict dy_in, + __global _FLOAT* __restrict dx_out, + const __global _FLOAT_PREC* __restrict bnScale, + const __global _FLOAT_PREC* __restrict delta_scale, + const __global _FLOAT_PREC* __restrict delta_bias, +#if MIO_BN_USESAVED == 1 + const __global _FLOAT_PREC* __restrict savedMean, + const __global _FLOAT_PREC* __restrict savedInvVariance, #endif - _FLOAT INHW) + _FLOAT_PREC INHW) { - int xgid = get_global_id(0); - int ygid = get_global_id(1); - int cidx = MIO_BN_HW * xgid; + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); + + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + unsigned int index; - _FLOAT mean, invVar; - _FLOAT elemStd, xhat; - _FLOAT scale, dscale, dbias; - _FLOAT tmp1, tmp2, tmp3; - _FLOAT NHW = (_FLOAT)MIO_BN_NHW; + _FLOAT_PREC_C mean, invVar; + _FLOAT_PREC_LS elemStd, xhat; + _FLOAT_PREC_C scale, dscale, dbias; + _FLOAT_PREC_LS tmp1, tmp2, tmp3, tmp4; + _FLOAT_PREC_LS value1, value2; + _FLOAT_LS read4; + _FLOAT_PREC NHW = (_FLOAT_PREC)MIO_BN_NHW; - local _FLOAT lscale, ldscale, ldbias, lmean, livar; + local _FLOAT_PREC_C lscale[MIO_BN_GRP0], ldscale[MIO_BN_GRP0], ldbias[MIO_BN_GRP0], + lmean[MIO_BN_GRP0], livar[MIO_BN_GRP0]; - if(get_local_id(1) == 0) + if(ylid == 0) { - -#if(MIO_BN_USESAVED == 0) - int ygrp_id = get_group_id(1); - int ygrp_sz = get_local_size(1); - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - lmean = *(dx_out + meanstashindex); // load stashed mean - livar = *(dx_out + varstashindex); -#else // SAVED - lmean = *(savedMean + xgid); - livar = *(savedInvVariance + xgid); -#endif // SAVED - lscale = *(bnScale + xgid); - ldscale = *(delta_scale + xgid); - ldbias = *(delta_bias + xgid); +#if MIO_BN_USESAVED == 0 + unsigned int xgrp_id = get_group_id(0); + unsigned int ygrp_id = get_group_id(1); + unsigned int xgrp_sz = get_local_size(0); + unsigned int ygrp_sz = get_local_size(1); + + lmean[xlid] = loadFromStash((__global _FLOAT_C*)dx_out, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + livar[xlid] = loadFromStash((__global _FLOAT_C*)dx_out, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); +#else + lmean[xlid] = *((const __global _FLOAT_PREC_C*)(savedMean + xgid * VEC_SIZE_X)); + livar[xlid] = *((const __global _FLOAT_PREC_C*)(savedInvVariance + xgid * VEC_SIZE_X)); +#endif + lscale[xlid] = *((const __global _FLOAT_PREC_C*)(bnScale + xgid * 
VEC_SIZE_X)); + ldscale[xlid] = *((const __global _FLOAT_PREC_C*)(delta_scale + xgid * VEC_SIZE_X)); + ldbias[xlid] = *((const __global _FLOAT_PREC_C*)(delta_bias + xgid * VEC_SIZE_X)); } barrier(CLK_LOCAL_MEM_FENCE); - //________________________________________________ - // Group level reduction - // Need to reduce over all elements in NxHxW - // move across the sections of an image in the mini_batch stack - if(ygid < MIO_BN_HW) - { - mean = lmean; - invVar = livar; - scale = lscale; - dscale = ldscale; - dbias = ldbias; + if(ygid * VEC_SIZE_Y < MIO_BN_HW) + { + mean = lmean[xlid]; + invVar = livar[xlid]; + scale = lscale[xlid]; + dscale = ldscale[xlid]; + dbias = ldbias[xlid]; for(unsigned int n = 0; n < MIO_BN_N; n++) { // apply normalization - index = n * MIO_BN_CHW + cidx + ygid; - elemStd = *(x_in + index) - mean; // (x_i - mean) - xhat = elemStd * invVar; // recalculating this again... - tmp1 = mad(NHW, *(dy_in + index), -dbias); - tmp2 = -xhat * dscale; - tmp3 = scale * invVar * INHW; - dx_out[index] = tmp3 * (tmp2 + tmp1); + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const __global _FLOAT_LS*)(x_in + index)); + value1 = FLOAT2FLOATPREC_VEC(read4); + elemStd = value1 - mean; // (x_i - mean) + xhat = elemStd * invVar; // recalculating this again... + read4 = *((const __global _FLOAT_LS*)(dy_in + index)); + value2 = FLOAT2FLOATPREC_VEC(read4); + tmp1 = mad((_FLOAT_PREC_LS)NHW, value2, -dbias); + tmp2 = -xhat * dscale; + tmp3 = scale * invVar * INHW; + tmp4 = tmp3 * (tmp2 + tmp1); + *((__global _FLOAT_LS*)(dx_out + index)) = FLOATPREC2FLOAT_VEC(tmp4); } } } diff --git a/src/kernels/MIOpenBatchNormFwdInferPerAct.cl b/src/kernels/MIOpenBatchNormFwdInferPerAct.cl index f516a076ea..df2d66582d 100644 --- a/src/kernels/MIOpenBatchNormFwdInferPerAct.cl +++ b/src/kernels/MIOpenBatchNormFwdInferPerAct.cl @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -35,43 +35,52 @@ #include "batchnorm_functions.h" __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormFwdInferPerActivationEst(const __global _FLOAT* in, - __global _FLOAT* __restrict out, - __global _FLOAT_PREC* __restrict estimatedMean, - __global _FLOAT_PREC* __restrict estimatedVariance, +MIOpenBatchNormFwdInferPerActivationEst(const __global _FLOAT* __restrict in, /* x input */ + __global _FLOAT* __restrict out, /* y output */ + const __global _FLOAT_PREC* __restrict estimatedMean, + const __global _FLOAT_PREC* __restrict estimatedVariance, const __global _FLOAT_PREC* __restrict scale, const __global _FLOAT_PREC* __restrict bias, double epsilon, + unsigned int c, + unsigned int hw, unsigned int batchSize, - unsigned int imageDims, + unsigned int cStride, + unsigned int hwStride, unsigned int batchStride) { + int xgid = get_global_id(0); + int ygid = get_global_id(1); + + if(xgid * VEC_SIZE_X >= c || ygid * VEC_SIZE_Y >= hw) + return; - // PER ACTIVATION - _FLOAT_PREC mean, variance; - _FLOAT_PREC invVariance, elemStd, inhat; - _FLOAT_PREC pvt_scale, pvt_bias; unsigned int adjIndex, index; - int ygid = get_global_id(1); - int yglb_sz = get_global_size(1); - int grpid = get_group_id(0); - for(int img_offset = ygid; img_offset < imageDims; img_offset += yglb_sz) + // PER ACTIVATION + _FLOAT_PREC_LS mean, variance, invVariance; + _FLOAT_PREC_LS inhat; + _FLOAT_PREC_LS pscale, pbias; + _FLOAT_LS value; + + adjIndex = (xgid * cStride * VEC_SIZE_X) + (ygid * hwStride * VEC_SIZE_Y); + mean = *((const __global _FLOAT_PREC_LS*)(estimatedMean + adjIndex)); + variance = *((const __global _FLOAT_PREC_LS*)(estimatedVariance + adjIndex)); + pscale = *((const __global _FLOAT_PREC_LS*)(scale + adjIndex)); 
+ pbias = *((const __global _FLOAT_PREC_LS*)(bias + adjIndex)); + invVariance = rsqrt(fabs(variance + (_FLOAT_PREC_LS)epsilon)); + + for(int n = 0; n < batchSize; n++) { - adjIndex = (grpid * imageDims) + img_offset; - mean = estimatedMean[adjIndex]; - variance = estimatedVariance[adjIndex]; - invVariance = rsqrt(fabs(variance + epsilon)); - pvt_scale = *(scale + adjIndex); - pvt_bias = *(bias + adjIndex); + index = (n * batchStride) + adjIndex; + value = *((const __global _FLOAT_LS*)(in + index)); + + inhat = FLOAT2FLOATPREC_VEC(value); + inhat = (inhat - mean) * invVariance; + inhat = mad(pscale, inhat, pbias); + value = FLOATPREC2FLOAT_VEC(inhat); - for(int n = 0; n < batchSize; n++) - { - index = (batchStride * n) + adjIndex; - elemStd = FLOAT2FLOATPREC(*(in + index)) - mean; - inhat = elemStd * invVariance; - out[index] = FLOATPREC2FLOAT(mad(pvt_scale, inhat, pvt_bias)); - } + *((__global _FLOAT_LS*)(out + index)) = value; } } diff --git a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl index a81db2a03b..788dbb1340 100644 --- a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. + * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,34 +42,42 @@ MIOpenBatchNormFwdInferSpatialEst(const __global _FLOAT* __restrict in, /* x inp const __global _FLOAT_PREC* __restrict scale, const __global _FLOAT_PREC* __restrict bias, double epsilon, + unsigned int c, + unsigned int hw, unsigned int batchSize, - unsigned int imageDims, + unsigned int cStride, + unsigned int hwStride, unsigned int batchStride) { + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); - int xgid = get_global_id(0); - int ygid = get_global_id(1); + if(xgid * VEC_SIZE_X >= c || ygid * VEC_SIZE_Y >= hw) + return; unsigned int index; + _FLOAT_PREC_C mean, variance, invVariance; + _FLOAT_PREC_C pscale, pbias; + _FLOAT_PREC_LS inhat; + _FLOAT_LS value; - _FLOAT_PREC mean, variance, invVariance; - _FLOAT_PREC inhat; - _FLOAT_PREC pscale, pbias; + mean = *((const __global _FLOAT_PREC_C*)(estimatedMean + xgid * VEC_SIZE_X)); + variance = *((const __global _FLOAT_PREC_C*)(estimatedVariance + xgid * VEC_SIZE_X)); + pscale = *((const __global _FLOAT_PREC_C*)(scale + xgid * VEC_SIZE_X)); + pbias = *((const __global _FLOAT_PREC_C*)(bias + xgid * VEC_SIZE_X)); + invVariance = rsqrt(fabs(variance + (_FLOAT_PREC_C)epsilon)); - mean = *(estimatedMean + xgid); - variance = *(estimatedVariance + xgid); - pscale = *(scale + xgid); - pbias = *(bias + xgid); - invVariance = rsqrt(fabs(variance + epsilon)); - - for(int idx = ygid; idx < imageDims; idx += get_global_size(1)) + for(int n = 0; n < batchSize; n++) { - for(int n = 0; n < batchSize; n++) - { - index = (n * batchStride) + (xgid * imageDims) + idx; - inhat = (FLOAT2FLOATPREC(*(in + index)) - mean) * invVariance; - out[index] = FLOATPREC2FLOAT(mad(pscale, inhat, pbias)); - } + index = (n * batchStride) + (xgid * cStride * VEC_SIZE_X) + (ygid * hwStride * VEC_SIZE_Y); + value = *((const __global _FLOAT_LS*)(in 
+ index)); + + inhat = FLOAT2FLOATPREC_VEC(value); + inhat = (inhat - mean) * invVariance; + inhat = mad(pscale, inhat, (_FLOAT_PREC_LS)pbias); + value = FLOATPREC2FLOAT_VEC(inhat); + + *((__global _FLOAT_LS*)(out + index)) = value; } } // end spatial norm diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index ef69963149..e3bf837cbf 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -446,51 +446,77 @@ MIOpenBatchNormFwdTrainSpatialNorm(const __global _FLOAT* __restrict in, const __global _FLOAT_PREC* __restrict bias) { - // SPATIAL - _FLOAT_PREC mean = (_FLOAT_PREC)0.; - _FLOAT_PREC invVariance = (_FLOAT_PREC)0.; - _FLOAT_PREC inhat = (_FLOAT_PREC)0.; - _FLOAT_PREC pvt_scale = (_FLOAT_PREC)0.; - _FLOAT_PREC pvt_bias = (_FLOAT_PREC)0.; - __local _FLOAT_PREC lcl_bias; - __local _FLOAT_PREC lcl_scale; - __local _FLOAT lcl_mean, lcl_ivar; + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; + unsigned int xgrp_id = get_group_id(0); unsigned int ygrp_id = get_group_id(1); unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); + unsigned int xlid = get_local_id(0); unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; + + // SPATIAL + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C invVariance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_LS inhat = (_FLOAT_PREC_LS)0.; + _FLOAT_PREC_C pvt_scale = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C pvt_bias = (_FLOAT_PREC_C)0.; + _FLOAT_LS value; + __local _FLOAT_PREC_C lcl_bias[MIO_BN_GRP0]; + __local _FLOAT_PREC_C lcl_scale[MIO_BN_GRP0]; + __local _FLOAT_PREC_C lcl_mean[MIO_BN_GRP0]; + __local _FLOAT_PREC_C lcl_ivar[MIO_BN_GRP0]; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) if(get_local_id(1) == 0) { - lcl_scale = *(scale + xgid); - lcl_bias = *(bias + xgid); - lcl_mean = *(out + meanstashindex); // load stashed mean - lcl_ivar = *(out + varstashindex); + lcl_scale[xlid] = *((const __global _FLOAT_PREC_C*)(scale + xgid * VEC_SIZE_X)); + lcl_bias[xlid] = *((const __global _FLOAT_PREC_C*)(bias + xgid * VEC_SIZE_X)); + lcl_mean[xlid] = loadFromStash((__global _FLOAT_C*)out, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + lcl_ivar[xlid] = loadFromStash((__global _FLOAT_C*)out, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } barrier(CLK_LOCAL_MEM_FENCE); - if(ygid < MIO_BN_HW) + if(ygid * VEC_SIZE_Y < MIO_BN_HW) { - mean = FLOAT2FLOATPREC(lcl_mean); - invVariance = FLOAT2FLOATPREC(lcl_ivar); - pvt_scale = lcl_scale; - pvt_bias = lcl_bias; + mean = lcl_mean[xlid]; + invVariance 
= lcl_ivar[xlid]; + pvt_scale = lcl_scale[xlid]; + pvt_bias = lcl_bias[xlid]; #if(MIO_BN_HW > MIO_BN_LOOP_UNROLL_MAXHW) for(unsigned int n = 0; n < MIO_BN_N; n++) #else __attribute__((opencl_unroll_hint(2))) for(unsigned int n = 0; n < MIO_BN_N; n++) #endif { // apply normalization - index = n * MIO_BN_CHW + cidx + ygid; - inhat = (FLOAT2FLOATPREC(*(in + index)) - mean) * invVariance; + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + value = *((const __global _FLOAT_LS*)(in + index)); + inhat = FLOAT2FLOATPREC_VEC(value); + inhat = (inhat - mean) * invVariance; + inhat = mad(pvt_scale, inhat, pvt_bias); + value = FLOATPREC2FLOAT_VEC(inhat); // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out[index] = FLOATPREC2FLOAT(mad(pvt_scale, inhat, pvt_bias)); + *((__global _FLOAT_LS*)(out + index)) = value; } // end for(n) } // end if(inImgIndex) } // end spatial norm @@ -516,65 +542,105 @@ MIOpenBatchNormFwdTrainSpatialFinalMeanVariance( #endif ) { - _FLOAT_PREC variance = (_FLOAT_PREC)0.; - _FLOAT_PREC invVariance = (_FLOAT_PREC)0.; - _FLOAT_PREC mean = (_FLOAT_PREC)0.; - unsigned int lid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); - unsigned int xgid = get_global_id(0); - unsigned int ygrp_sz = get_local_size(1); - unsigned int yngrps = get_num_groups(1); - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - unsigned int commitID = 0; - - for(int gn = 0; gn < yngrps; gn++) - { - unsigned int offset = gn * ygrp_sz + lid; - unsigned int meanindex = cidx + ygrp_sz * offset; - unsigned int varindex = cidx + ygrp_sz * offset + 2; - if(offset < yngrps) - { // modify to span larger number of groups - mean += FLOAT2FLOATPREC(*(meanvarbuff + meanindex)); - variance += FLOAT2FLOATPREC(*(meanvarbuff + varindex)); // load per group variance - } + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C 
invVariance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgrp_sz = get_local_size(0); + unsigned int ygrp_sz = get_local_size(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + unsigned int commitID = 0; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) + { + mean += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + variance += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDS_SIZE]; - lds_reduce2(&mean, &variance, (_FLOAT_ACCUM)INHW, lcl_data_x, lcl_data_y, lid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + // TODO: this simple approach has many bank conflicts, optimize if it affects performance + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + INHW, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else commitID = 64; - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&mean, &variance, (_FLOAT_ACCUM)INHW, lcl_data_x, lcl_data_y, lid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&mean, &variance, INHW, lcl_data_x, lcl_data_y, ylid); #endif - barrier(CLK_LOCAL_MEM_FENCE | 
CLK_GLOBAL_MEM_FENCE); - variance = mad(-mean, mean, variance); - if(variance < 0) - { - variance = 0; - } - invVariance = rsqrt(variance + epsilon); - if(lid == commitID) - { - meanvarbuff[meanstashindex] = FLOATPREC2FLOAT(mean); // stash mean - meanvarbuff[varstashindex] = FLOATPREC2FLOAT(invVariance); // stash mean + variance = mad(-mean, mean, variance); + variance = max(variance, (_FLOAT_PREC_C)0.); + invVariance = rsqrt(variance + (_FLOAT_PREC_C)epsilon); + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) + { + storeToStash(mean, + (__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(invVariance, + (__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } // Save mean and calculate and save running mean - unsigned int ygid = get_global_id(1); if(ygid == commitID) { #if(MIO_RUNNING_RESULT == 1) - running_stash(resultRunningMean, resultRunningVariance, expAvgFactor, mean, variance, xgid); + running_stash((global _FLOAT_PREC_C*)resultRunningMean, + (global _FLOAT_PREC_C*)resultRunningVariance, + expAvgFactor, + mean, + variance, + xgid); #endif #if(MIO_SAVE_MEAN_VARIANCE == 1) - saved_stash(resultSaveMean, resultSaveInvVariance, mean, invVariance, xgid); + saved_stash((global _FLOAT_PREC_C*)resultSaveMean, + (global _FLOAT_PREC_C*)resultSaveInvVariance, + mean, + invVariance, + xgid); #endif } } @@ -584,50 +650,77 @@ MIOpenBatchNormFwdTrainSpatialMeanVariance(const __global _FLOAT* __restrict in, __global _FLOAT* __restrict mvbuff) { - unsigned int ylid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int ygrp_id = get_group_id(1); + 
unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanindex = cidx + ygrp_sz * ygrp_id; - unsigned int varindex = meanindex + 2; - _FLOAT_ACCUM mean = (_FLOAT_ACCUM)0.; - _FLOAT_ACCUM variance = (_FLOAT_ACCUM)0.; - _FLOAT_ACCUM value = (_FLOAT_ACCUM)0.; + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_LS value; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; - if(ygid < MIO_BN_HW) + if(ygid * VEC_SIZE_Y < MIO_BN_HW) { -#if(MIO_BN_HW > MIO_BN_LOOP_UNROLL_MAXHW) + _FLOAT_LS read4; for(unsigned int n = 0; n < MIO_BN_N; n++) -#else - __attribute__((opencl_unroll_hint(2))) for(unsigned int n = 0; n < MIO_BN_N; n++) -#endif { - index = n * MIO_BN_CHW + cidx + ygid; - value = FLOAT2ACCUM(*(in + index)); - mean += value; - variance = mad(value, value, variance); + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const global _FLOAT_LS*)(in + index)); + value = FLOAT2FLOATPREC_VEC(read4); + _ACCUMULATE(mean, value) + _ACCUMULATE_MAD(variance, value, value, variance) } } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDS_SIZE]; - lds_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + // TODO: this simple approach has many bank conflicts, optimize if it affects performance + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM 
lcl_data_y[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; gcn_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif if(ylid == 0) { - mvbuff[meanindex] = ACCUM2FLOAT(mean); - mvbuff[varindex] = ACCUM2FLOAT(variance); + storeToStash(mean, + (__global _FLOAT_C*)mvbuff, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(variance, + (__global _FLOAT_C*)mvbuff, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -} // end spatial mean kernel +} #elif(MIO_BN_VARIANT == 3) diff --git a/src/kernels/batchnorm_functions.h b/src/kernels/batchnorm_functions.h index b95ff4cc1f..7f92514cb6 100644 --- a/src/kernels/batchnorm_functions.h +++ b/src/kernels/batchnorm_functions.h @@ -96,6 +96,7 @@ #define _AS_FLOAT PPCAT(as_, _FLOAT) #define _FLOAT_PREC4 PPCAT(_FLOAT_PREC, FOUR) +#define _FLOAT_ACCUM4 PPCAT(_FLOAT_ACCUM, FOUR) #ifndef MIO_BN_LDSGCN_SIZE #define MIO_BN_LDSGCN_SIZE 16 @@ -206,30 +207,211 @@ #define MIO_BN_GFX120X 0 #endif +#ifndef MIO_BN_VECTORIZE +#define MIO_BN_VECTORIZE 0 +#endif + +#ifndef MIO_BN_STASH_METHOD +#define MIO_BN_STASH_METHOD 0 +#endif + +#define FLOATPREC4_2_FLOAT4(val) \ + ((_FLOAT4)(FLOATPREC2FLOAT(val.x), \ + FLOATPREC2FLOAT(val.y), \ + FLOATPREC2FLOAT(val.z), \ + FLOATPREC2FLOAT(val.w))) + +#define FLOAT4_2_FLOATPREC4(val) \ + ((_FLOAT_PREC4)(FLOAT2FLOATPREC(val.x), \ + FLOAT2FLOATPREC(val.y), \ + FLOAT2FLOATPREC(val.z), \ + FLOAT2FLOATPREC(val.w))) + +#define _ACCUMULATE1(a, b) a += b; + +#define _ACCUMULATE_MAD1(a, b, c, d) a = mad(b, c, d); + +#define _ACCUMULATE4(a, b) \ + a += b.x; \ + a += b.y; \ + a += b.z; \ + a += b.w; + +#define _ACCUMULATE_MAD4(a, b, c, d) \ + a = mad(b.x, c.x, d); \ + a = mad(b.y, c.y, d); \ + a = mad(b.z, c.z, d); \ + a = mad(b.w, c.w, d); + +#if MIO_BN_VECTORIZE +#define VEC_SIZE 4 + 
+#if MIO_LAYOUT_NHWC +// NHWC vectorize in X direction which corresponds +// to channels +#define VEC_SIZE_X VEC_SIZE +#define VEC_SIZE_Y 1 +// _C suffix means used for computation +// _LS suffix means used for loading / storing +#define _FLOAT_PREC_C _FLOAT_PREC4 +#define _FLOAT_PREC_LS _FLOAT_PREC4 +#define _FLOAT_C _FLOAT4 +#define _FLOAT_LS _FLOAT4 +#define _FLOAT_ACCUM_C _FLOAT_ACCUM4 +#define _FLOAT_ACCUM_LS _FLOAT_ACCUM4 +#define _ACCUMULATE _ACCUMULATE1 +#define _ACCUMULATE_MAD _ACCUMULATE_MAD1 +#else +// NCHW vectorize in Y direction which corresponds +// to HW +#define VEC_SIZE_X 1 +#define VEC_SIZE_Y VEC_SIZE +#define _FLOAT_PREC_C _FLOAT_PREC +#define _FLOAT_PREC_LS _FLOAT_PREC4 +// _C suffix means used for computation +// _LS suffix means used for loading / storing +#define _FLOAT_C _FLOAT +#define _FLOAT_LS _FLOAT4 +#define _FLOAT_ACCUM_C _FLOAT_ACCUM +#define _FLOAT_ACCUM_LS _FLOAT_ACCUM4 +#define _ACCUMULATE _ACCUMULATE4 +#define _ACCUMULATE_MAD _ACCUMULATE_MAD4 +#endif + +#define FLOAT2FLOATPREC_VEC FLOAT4_2_FLOATPREC4 +#define FLOATPREC2FLOAT_VEC FLOATPREC4_2_FLOAT4 + +#else + +#define VEC_SIZE 1 +#define VEC_SIZE_X 1 +#define VEC_SIZE_Y 1 +#define _FLOAT_PREC_C _FLOAT_PREC +#define _FLOAT_PREC_LS _FLOAT_PREC +#define _FLOAT_C _FLOAT +#define _FLOAT_LS _FLOAT +#define _FLOAT_ACCUM_C _FLOAT_ACCUM +#define _FLOAT_ACCUM_LS _FLOAT_ACCUM +#define FLOAT2FLOATPREC_VEC FLOAT2FLOATPREC +#define FLOATPREC2FLOAT_VEC FLOATPREC2FLOAT +#define _ACCUMULATE _ACCUMULATE1 +#define _ACCUMULATE_MAD _ACCUMULATE_MAD1 + +#endif + #define UNUSED __attribute__((__unused__)) +#if(MIO_BN_VARIANT == 2) + +#if(MIO_BN_STASH_METHOD == 0) +// store values in HW dimension +#define NSTRIDE ystride +#else +// store values in N dimension +#define NSTRIDE (MIO_BN_C / VEC_SIZE_X * MIO_BN_HW) +#endif + +inline unsigned int getStashIndex(unsigned int vindex, + unsigned int ygroupoffset, + unsigned int ystride, + unsigned int xgrp_sz, + unsigned int xgrp_id, + unsigned int xlid, + 
unsigned int xstride) +{ +#if MIOPEN_USE_FPMIX || MIOPEN_USE_BFPMIX + // 2 _FLOAT values are used to store 1 _FLOAT_PREC value. +#if MIO_LAYOUT_NHWC +#if MIO_BN_C % 2 == 0 + // xgrp_sz values are split in two parts: even threads use 2 values at even rows, odd threads - + // at odd rows. + // The only restriction for C and xgrp_sz is that they must be even. + return (vindex * 2 + xlid % 2) * NSTRIDE + ygroupoffset * ystride + + (xgrp_sz * xgrp_id + xlid / 2 * 2) * xstride; +#else + // Values are stored consecutively in y dim. + return (vindex * 2) * NSTRIDE + ygroupoffset * ystride + (xgrp_sz * xgrp_id + xlid) * xstride; +#endif +#else // !MIO_LAYOUT_NHWC + // Values are stored consecutively in y dim, indices are aligned up by 2 (_FLOAT_PREC). + return ((vindex * 2) * NSTRIDE + ygroupoffset * ystride + (xgrp_sz * xgrp_id + xlid) * xstride + + 1) / + 2 * 2; +#endif +#else + return vindex * NSTRIDE + ygroupoffset * ystride + (xgrp_sz * xgrp_id + xlid) * xstride; +#endif +} + +inline _FLOAT_PREC_C loadFromStash(const __global _FLOAT_C* stash, + unsigned int vindex, + unsigned int ygroupoffset, + unsigned int ystride, + unsigned int xgrp_sz, + unsigned int xgrp_id, + unsigned int xlid, + unsigned int xstride) +{ + unsigned int index = + getStashIndex(vindex, ygroupoffset, ystride, xgrp_sz, xgrp_id, xlid, xstride); + +#if(MIO_BN_STASH_METHOD == 0 || MIO_BN_STASH_METHOD == 1) + return *((const __global _FLOAT_PREC_C*)(stash + index)); +#else + _FLOAT_PREC_C value; + *((_FLOAT_C*)(&value)) = *(stash + index); + index += NSTRIDE; + *((_FLOAT_C*)(&value) + 1) = *(stash + index); + + return value; +#endif +} + +inline void storeToStash(_FLOAT_PREC_C value, + __global _FLOAT_C* stash, + unsigned int vindex, + unsigned int ygroupoffset, + unsigned int ystride, + unsigned int xgrp_sz, + unsigned int xgrp_id, + unsigned int xlid, + unsigned int xstride) +{ + unsigned int index = + getStashIndex(vindex, ygroupoffset, ystride, xgrp_sz, xgrp_id, xlid, xstride); + 
+#if(MIO_BN_STASH_METHOD == 0 || MIO_BN_STASH_METHOD == 1) + *((__global _FLOAT_PREC_C*)(stash + index)) = value; +#else + *(stash + index) = *((_FLOAT_C*)(&value)); + index += NSTRIDE; + *(stash + index) = *((_FLOAT_C*)(&value) + 1); +#endif +} +#endif + #if(MIO_BN_VARIANT != 4) -static inline void running_stash(global _FLOAT_PREC* resultRunningMean, - global _FLOAT_PREC* resultRunningVariance, +static inline void running_stash(global _FLOAT_PREC_C* resultRunningMean, + global _FLOAT_PREC_C* resultRunningVariance, double expAvgFactor, - _FLOAT_ACCUM mean, - _FLOAT_ACCUM variance, + _FLOAT_ACCUM_C mean, + _FLOAT_ACCUM_C variance, uint channel) { - _FLOAT_ACCUM pvt_runMean = (_FLOAT_ACCUM)(*(resultRunningMean + channel)); - _FLOAT_ACCUM pvt_newRunMean = + _FLOAT_ACCUM_C pvt_runMean = (_FLOAT_ACCUM_C)(*(resultRunningMean + channel)); + _FLOAT_ACCUM_C pvt_newRunMean = mad((_FLOAT_ACCUM)-expAvgFactor, pvt_runMean, pvt_runMean); // tmp = oldRunMean*(1-factor) - resultRunningMean[channel] = - (_FLOAT_PREC)mad(mean, (_FLOAT_ACCUM)expAvgFactor, pvt_newRunMean); // newMean*factor + tmp - const _FLOAT_ACCUM adjust = - (_FLOAT_ACCUM)((MIO_BN_NHW == 1) - ? variance - : variance * ((_FLOAT_ACCUM)MIO_BN_NHW / - ((_FLOAT_ACCUM)MIO_BN_NHW - (_FLOAT_ACCUM)1.0))); + resultRunningMean[channel] = (_FLOAT_PREC_C)mad( + mean, (_FLOAT_ACCUM)expAvgFactor, pvt_newRunMean); // newMean*factor + tmp + const _FLOAT_ACCUM_C adjust = + (_FLOAT_ACCUM_C)((MIO_BN_NHW == 1) + ? 
variance + : variance * ((_FLOAT_ACCUM)MIO_BN_NHW / + ((_FLOAT_ACCUM)MIO_BN_NHW - (_FLOAT_ACCUM)1.0))); resultRunningVariance[channel] = - (_FLOAT_PREC)((1 - (_FLOAT_ACCUM)expAvgFactor) * - (_FLOAT_ACCUM)(*(resultRunningVariance + channel)) + - (_FLOAT_ACCUM)expAvgFactor * adjust); + (_FLOAT_PREC_C)((1 - (_FLOAT_ACCUM)expAvgFactor) * + (_FLOAT_ACCUM_C)(*(resultRunningVariance + channel)) + + (_FLOAT_ACCUM)expAvgFactor * adjust); } static inline void running_stash_pa(global _FLOAT_PREC* resultRunningMean, @@ -279,12 +461,12 @@ static inline void running_stash_dyn(global _FLOAT_PREC* resultRunningMean, } #endif -static inline void saved_stash(global _FLOAT_PREC* resultSaveMean, - global _FLOAT_PREC* resultSaveInvVariance, - _FLOAT_ACCUM mean, - _FLOAT_ACCUM invVariance, +static inline void saved_stash(global _FLOAT_PREC_C* resultSaveMean, + global _FLOAT_PREC_C* resultSaveInvVariance, + _FLOAT_ACCUM_C mean, + _FLOAT_ACCUM_C invVariance, uint channel) { - *(resultSaveMean + channel) = (_FLOAT_PREC)mean; - *(resultSaveInvVariance + channel) = (_FLOAT_PREC)invVariance; + *(resultSaveMean + channel) = (_FLOAT_PREC_C)mean; + *(resultSaveInvVariance + channel) = (_FLOAT_PREC_C)invVariance; } diff --git a/src/kernels/reduction_functions.h b/src/kernels/reduction_functions.h index 4e8c607982..88aeed431a 100644 --- a/src/kernels/reduction_functions.h +++ b/src/kernels/reduction_functions.h @@ -59,6 +59,30 @@ regLDSreduce(_FLOAT_ACCUM* value, local _FLOAT_ACCUM* data, uint localID, _FLOAT #endif +static inline void lds_reduce2_2d(_FLOAT_ACCUM_C* x, + _FLOAT_ACCUM_C* y, + _FLOAT_ACCUM scale, + local _FLOAT_ACCUM_C* lcl_data_x, + local _FLOAT_ACCUM_C* lcl_data_y, + unsigned int lid, + unsigned int size) +{ + lcl_data_x[lid] = (_FLOAT_ACCUM_C)*x; + lcl_data_y[lid] = (_FLOAT_ACCUM_C)*y; + barrier(CLK_LOCAL_MEM_FENCE); + for(unsigned int red = (size >> 1); red > 0; red >>= 1) + { + if(lid < red) + { + lcl_data_x[lid] += lcl_data_x[lid + red]; + lcl_data_y[lid] += lcl_data_y[lid + 
red]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + *x = (_FLOAT_ACCUM_C)(lcl_data_x[0] * scale); + *y = (_FLOAT_ACCUM_C)(lcl_data_y[0] * scale); +} + #if MIOPEN_USE_AMDGCN static inline void dpp_reduction(_FLOAT_ACCUM* temp_sum) { diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 7d3310145b..dca3ea6803 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -129,7 +129,8 @@ void BatchNormForwardTraining(const Handle& handle, expAvgFactor, epsilon, resultsave, - resultrunning}; + resultrunning, + size_t(0.6f * handle.GetMaxComputeUnits())}; const auto algo = bn_mode == miopenBNSpatial ? AlgorithmName{"miopenBatchNormForwardTrainingSpatial"} @@ -151,10 +152,9 @@ void BatchNormForwardTraining(const Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKFwdTraining>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); @@ -250,9 +250,8 @@ void BatchNormForwardInference(const Handle& handle, }(); const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; - const auto solvers = solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKFwdInference>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } @@ -373,7 +372,8 @@ void BatchNormBackward(const Handle& handle, savedMeanDesc, savedVarianceDesc, epsilon, - useSaved}; + useSaved, + size_t(0.6f * handle.GetMaxComputeUnits())}; const auto algo = bn_mode == miopenBNSpatial ? 
AlgorithmName{"miopenBatchNormBackwardPropSpatial"} @@ -394,10 +394,9 @@ void BatchNormBackward(const Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKBwdBackward>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); diff --git a/src/solver.cpp b/src/solver.cpp index c3b077d435..bb707fff81 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -562,19 +562,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Activation, activ::ActivBwdSolver0{}.SolverDbId()); Register(registry, ++id, Primitive::Activation, activ::ActivBwdSolver1{}.SolverDbId()); - RegisterWithSolver(registry, ++id, Primitive::Batchnorm); + // combine BnFwdTrainingSpatialMultiple and BnFwdTrainingSpatialSingle + RegisterWithSolver(registry, ++id, Primitive::Batchnorm); RegisterWithSolver( registry, ++id, conv::ConvCkIgemmFwdV6r1DlopsNchw{}, miopenConvolutionAlgoImplicitGEMM); - RegisterWithSolver( - registry, ++id, Primitive::Batchnorm); - + ++id; // removed solver BnFwdTrainingSpatialMultiple (it is now part of BnFwdTrainingSpatial) RegisterWithSolver(registry, ++id, Primitive::Batchnorm); - RegisterWithSolver(registry, ++id, Primitive::Batchnorm); - RegisterWithSolver( - registry, ++id, Primitive::Batchnorm); + // combine BnBwdTrainingSpatialMultiple and BnBwdTrainingSpatialSingle + RegisterWithSolver(registry, ++id, Primitive::Batchnorm); + ++id; // removed solver BnBwdTrainingSpatialMultiple (it is now part of BnBwdTrainingSpatial) RegisterWithSolver(registry, ++id, Primitive::Batchnorm); RegisterWithSolver(registry, ++id, Primitive::Batchnorm); diff --git a/src/solver/batchnorm/backward_spatial.cpp b/src/solver/batchnorm/backward_spatial.cpp new file mode 100644 index 0000000000..75fc09f4a5 --- /dev/null +++ b/src/solver/batchnorm/backward_spatial.cpp @@ -0,0 +1,425 @@ +/******************************************************************************* + * + * MIT License 
+ * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +// Spatial multiple needs space for 4 fp32 elements +// per each x thread (including the last workgroup) +// to stash intermediate mean and variance +const unsigned int stash_values_bwd = 4; + +bool PerformanceConfigBnBwdBackward::IsValid( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const +{ + if(this->kernel_id.empty()) + { + return false; + } + + // if default config is variant 2, check if it can be applied + // (based on variant 2 restrictions) + size_t vectorsize; + int variant = 0; + GetVariantFromKernelId(this->kernel_id, variant, vectorsize); + if(variant == 2) + { + return IsSpatialMultipleApplicable(problem, vectorsize, stash_values_bwd); + } + return true; +} + +void PerformanceConfigBnBwdBackward::HeuristicInit( + const miopen::batchnorm::ProblemDescription& problem) +{ + // Define default configuration based on heuristics and + // add all other valid configurations for the given problem + if(UseMultiple(problem)) + { + DefaultConfigSpatialMultiple(problem, stash_values_bwd, this->valid_kernels); + DefaultConfigSpatialSingle(problem, this->valid_kernels); + } + else + { + DefaultConfigSpatialSingle(problem, this->valid_kernels); + DefaultConfigSpatialMultiple(problem, stash_values_bwd, this->valid_kernels); + } + + // Set index and kernel_id to default value + this->index = 0; + this->kernel_id = valid_kernels[0]; +} + +bool PerformanceConfigBnBwdBackward::SetNextValue( + const miopen::batchnorm::ProblemDescription& problem_desc) +{ + // In case the valid_kernel list is empty, we fill it with + // default value as first one and all other valid ones will follow + if(this->valid_kernels.empty()) + { + this->HeuristicInit(problem_desc); + return true; + } + // Get next valid 
configuration + if((this->index + 1) < valid_kernels.size()) + { + ++this->index; + this->kernel_id = this->valid_kernels[index]; + return true; + } + else + { + return false; + } +} + +bool PerformanceConfigBnBwdBackward::operator==(const PerformanceConfigBnBwdBackward& other) const +{ + return this->kernel_id == other.kernel_id; +} + +bool PerformanceConfigBnBwdBackward::IsValidValue() const +{ + return this->index >= 0 && this->index < valid_kernels.size(); +} + +bool BnBwdTrainingSpatial::IsApplicable( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const +{ + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward || + bn_problem.GetMode() != miopenBNSpatial) + return false; + + if(!bn_problem.Is2D()) + return false; + +#if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR + if(bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenHalf) + { + // bfp16parm = true; + // Unsupported kernel mode, error in kernel code + // MIOpenBatchNormBwdSpatial.cl:526 issue#1549 + return false; + } +#endif + if(!IsOCLBwdTypeValid(bn_problem)) + return false; + + return true; +} + +PerformanceConfigBnBwdBackward BnBwdTrainingSpatial::GetDefaultPerformanceConfig( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem_desc) const +{ + PerformanceConfigBnBwdBackward pp; + pp.HeuristicInit(problem_desc); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} + +bool BnBwdTrainingSpatial::IsValidPerformanceConfig( + const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem_desc, + const PerformanceConfigBnBwdBackward& config) const +{ + return config.IsValid(ctx, problem_desc); +} + +PerformanceConfigBnBwdBackward +BnBwdTrainingSpatial::Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, problem, invoke_ctx); +} + +ConvSolution 
BnBwdTrainingSpatial::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnBwdBackward& config) const +{ + const auto& handle = context.GetStream(); + const unsigned wavesize = (miopen::StartsWith(handle.GetDeviceName(), "gfx10") ? 32 : 64); + + bool bfpmixparm = false; + bool bbfpmixparam = false; + bool bfp16parm = false; + bool bfp32parm = true; + + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) + { + bfp16parm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenHalf && + problem.GetBnScale().GetType() == miopenFloat) + { + bfpmixparm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat) + { + bbfpmixparam = true; + bfp32parm = false; + } + + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nstride = c * in_cstride; + unsigned int in_nhw = n * in_cstride; + unsigned int in_nchw = n * in_nstride; + + auto inhw = float(1.0 / in_nhw); + + int variant = -1; + size_t vectorsize = 1; + GetVariantFromKernelId(config.kernel_id, variant, vectorsize); + + size_t xlocalsize, xgridsize; + size_t ylocalsize = 1, ygridsize = 1, zlocalsize = 1, zgridsize = 1; + unsigned int ldsgcn, ldsnogcn; + int stash_method = -1; + if(variant != 2) + { + xlocalsize = 1024; + xgridsize = static_cast(1024) * c; + ldsgcn = xlocalsize / wavesize; + ldsnogcn = xlocalsize; + } + else + { + GetSpatialMultipleConfig(problem, + stash_values_bwd, + vectorsize, + xlocalsize, + ylocalsize, + xgridsize, + ygridsize, + stash_method); + ldsnogcn = xlocalsize * ylocalsize; + ldsgcn = xlocalsize * ylocalsize / wavesize; + } + + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", 
static_cast(bfp16parm)}, + {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, + {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, + {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, + {"MIO_BN_USESAVED", static_cast(problem.UseSaved())}, + {"MIO_BN_N", static_cast(n)}, + {"MIO_BN_C", static_cast(c)}, + {"MIO_BN_HW", static_cast(in_cstride)}, + {"MIO_BN_NHW", static_cast(in_nhw)}, + {"MIO_BN_CHW", in_nstride}, + {"MIO_BN_NCHW", in_nchw}, + {"MIO_BN_NGRPS", ygridsize / ylocalsize}, + {"MIO_BN_LDS_SIZE", ldsnogcn}, + {"MIO_BN_LDSGCN_SIZE", ldsgcn}, + {"MIO_BN_VARIANT", variant}, + {"MIO_WAVESIZE", wavesize}, + {"MIO_BN_GRP0", xlocalsize}, + {"MIO_BN_GRP1", ylocalsize}, + {"MIO_BN_GRP2", zlocalsize}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, + {"MIO_BN_VECTORIZE", static_cast(vectorsize > 1)}, + {"MIO_BN_STASH_METHOD", stash_method}, + }; + + { + // OpenCL kernels for variant 0-4 + kernel.kernel_file = "MIOpenBatchNormBwdSpatial.cl"; + std::string kernel_name = "MIOpenBatchNormBwdSpatial"; + + build_params << KernelBuildParameters{ + {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, + {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, + {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + if(variant != 2) + { + kernel.kernel_name = kernel_name; + result.construction_params.push_back(kernel); + } + else + { + auto single_ygroup_kernel = kernel; + + single_ygroup_kernel.g_wk[1] = single_ygroup_kernel.l_wk[1]; + + if(!problem.UseSaved()) + { + kernel.kernel_name = kernel_name + "MeanVariance"; + result.construction_params.push_back(kernel); + + single_ygroup_kernel.kernel_name = kernel_name + "FinalMeanVariance"; + result.construction_params.push_back(single_ygroup_kernel); + } + + kernel.kernel_name = kernel_name + "DScaleDBias"; + result.construction_params.push_back(kernel); + + single_ygroup_kernel.kernel_name = kernel_name + "FinalDScaleDBias"; + result.construction_params.push_back(single_ygroup_kernel); + + kernel.kernel_name = kernel_name + "DX"; + result.construction_params.push_back(kernel); + } + } + } + + const auto dtype = problem.GetBnScale().GetType(); + const auto useSaved = problem.UseSaved(); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + float ctime = 0.; + visit_float(dtype, [&](auto as_float) { + if(variant != 2) + { + decltype(auto) kernel = handle_.Run(kernels.front()); + if(useSaved) + { + kernel(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + params.savedMean, + params.savedInvVariance, + as_float(inhw)); + } + else + { + kernel(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + params.epsilon, + inhw); + } + } + else + { + if(useSaved) + { + 
handle_.Run(kernels[0])(params.x, + params.dy, + params.dx, + params.savedMean, + params.savedInvVariance); + profileSequence(handle_, 0, &ctime); + + handle_.Run(kernels[1])( + params.dx, params.resultBnScaleDiff, params.resultBnBiasDiff); + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[2])(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + params.savedMean, + params.savedInvVariance, + as_float(inhw)); + profileSequence(handle_, 2, &ctime); + } + else + { + handle_.Run(kernels[0])(params.x, params.dx); // mean variance + profileSequence(handle_, 0, &ctime); + + handle_.Run(kernels[1])( + params.dx, as_float(inhw), params.epsilon); // final mean variance + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[2])(params.x, params.dy, params.dx); // dscale dbias + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[3])(params.dx, + params.resultBnScaleDiff, + params.resultBnBiasDiff); // final dscale dbias + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[4])(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + as_float(inhw)); // dx + profileSequence(handle_, 2, &ctime); + } + } + }); + }; + }; + + return result; +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp deleted file mode 100644 index e26922f478..0000000000 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ /dev/null @@ -1,352 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnBwdTrainingSpatialMultiple::IsApplicable( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || - problem.GetMode() != miopenBNSpatial) - return false; - if(!problem.Is2D()) - { - return false; - } - if(!IsOCLBwdTypeValid(problem)) - return false; - -#if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - // bfp16parm = true; - // Unsupported kernel mode, error in kernel code - // MIOpenBatchNormBwdSpatial.cl:526 issue#1549 - return false; - } -#endif - - return !BnBwdTrainingSpatialSingle{}.IsApplicable(context, problem); -} - -ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - - 
auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1; - size_t ylocalsize = 1; - - size_t xgridsize = 1; - size_t ygridsize = 1; - - unsigned int ldsgcn = 0; - unsigned int ldsnogcn = 0; - int variant = 1; - - if(problem.IsLayoutNHWC()) - { - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else - { - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize - // work groups over channels and loop through NHW. - //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 
1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) - { - if((n > 64) && (in_cstride > 160)) - { - variant = 3; - xlocalsize = - std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else - { - variant = 0; - if(bfp32parm) - { - xlocalsize = 1024; - xgridsize = static_cast(1024) * c; - } - else - { - xlocalsize = 256; - xgridsize = static_cast(256) * c; - } - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize - // work groups over channels and data segments. 
- //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - } - - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - kernel.kernel_file = "MIOpenBatchNormBwdSpatial.cl"; - kernel.kernel_name = "MIOpenBatchNormBwdSpatial"; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_BN_USESAVED", static_cast(problem.UseSaved())}, - {"MIO_BN_N", static_cast(n)}, - {"MIO_BN_C", static_cast(c)}, - {"MIO_BN_HW", static_cast(in_cstride)}, - {"MIO_BN_NHW", static_cast(in_nhw)}, - {"MIO_BN_CHW", in_nstride}, - {"MIO_BN_NCHW", in_nchw}, - {"MIO_BN_NGRPS", int(std::ceil(float(ygridsize) / ylocalsize))}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", ldsgcn}, - {"MIO_BN_VARIANT", variant}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - if(problem.UseSaved()) - { - auto copy = kernel; - - copy.kernel_name = kernel.kernel_name + "DScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalDScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "DX"; - result.construction_params.push_back(copy); - } - else - { - auto copy = kernel; - - copy.kernel_name = kernel.kernel_name + "MeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalMeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "DScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalDScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "DX"; - result.construction_params.push_back(copy); - } - } - - const auto dtype = problem.GetBnScale().GetType(); - const auto useSaved = problem.UseSaved(); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); - - float ctime = 0.; - visit_float(dtype, [&](auto as_float) { - if(useSaved) - { - handle_.Run(kernels[0])( - params.x, params.dy, params.dx, params.savedMean, params.savedInvVariance); - profileSequence(handle_, 0, &ctime); - - handle_.Run(kernels[1])( - params.dx, params.resultBnScaleDiff, params.resultBnBiasDiff); - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[2])(params.x, - params.dy, - 
params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - params.savedMean, - params.savedInvVariance, - as_float(inhw)); - profileSequence(handle_, 2, &ctime); - } - else - { - handle_.Run(kernels[0])(params.x, params.dx); // mean variance - profileSequence(handle_, 0, &ctime); - - handle_.Run(kernels[1])( - params.dx, as_float(inhw), params.epsilon); // final mean variance - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[2])(params.x, params.dy, params.dx); // dscale dbias - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[3])(params.dx, - params.resultBnScaleDiff, - params.resultBnBiasDiff); // final dscale dbias - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[4])(params.x, - params.dy, - params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - as_float(inhw)); // dx - profileSequence(handle_, 2, &ctime); - } - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp deleted file mode 100644 index 9b375f517c..0000000000 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include - -#define WORKAROUND_ISSUE_1146 1 // check asm solver applicability for gfx90a - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnBwdTrainingSpatialSingle::IsApplicable( - const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const -{ - if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward || - bn_problem.GetMode() != miopenBNSpatial) - return false; - if(!bn_problem.Is2D()) - return false; - -#if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR - if(bn_problem.GetXDesc().GetType() == miopenHalf && - bn_problem.GetBnScale().GetType() == miopenHalf) - { - // bfp16parm = true; - // Unsupported kernel mode, error in kernel code - // MIOpenBatchNormBwdSpatial.cl:526 issue#1549 - return false; - } -#endif - if(!IsOCLBwdTypeValid(bn_problem)) - return false; - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(bn_problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nhw = n * in_cstride; - - return (in_cstride > 1024 && in_nhw < (32 * 1024 * 1024)) || - (in_cstride > 512 && in_nhw < (32 * 1024 * 1024)) || in_cstride <= 512; -} - -ConvSolution -BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - const unsigned wavesize = (miopen::StartsWith(handle.GetDeviceName(), "gfx10") ? 
32 : 64); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - - auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1; - size_t ylocalsize = 1; - - size_t xgridsize = 1; - size_t ygridsize = 1; - - unsigned int ldsgcn = 0; - unsigned int ldsnogcn = 0; - int variant = 1; - - if(problem.IsLayoutNHWC()) - { - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - else - { - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize - // work groups over channels and loop through NHW. 
- //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 1 : 3; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) - { - if((n > 64) && (in_cstride > 160)) - { - variant = 3; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - else - { - variant = 0; - xlocalsize = 1024; - xgridsize = static_cast(1024) * c; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize - // work groups over channels and data segments. 
- //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / wavesize; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - } - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_BN_USESAVED", static_cast(problem.UseSaved())}, - {"MIO_BN_N", static_cast(n)}, - {"MIO_BN_C", static_cast(c)}, - {"MIO_BN_HW", static_cast(in_cstride)}, - {"MIO_BN_NHW", static_cast(in_nhw)}, - {"MIO_BN_CHW", in_nstride}, - {"MIO_BN_NCHW", in_nchw}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", ldsgcn}, - {"MIO_BN_VARIANT", variant}, - {"MIO_WAVESIZE", wavesize}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - if((n > 64) && (n % 2 == 0) && (variant == 3) && (bfpmixparm) && (problem.UseSaved()) && - context.use_asm_kernels && context.rmv.IsV2orV3() && - (StartsWith(handle.GetDeviceName(), "gfx8") || - (StartsWith(handle.GetDeviceName(), "gfx9") -#if WORKAROUND_ISSUE_1146 - && (handle.GetDeviceName() != "gfx90a") -#endif - && (!StartsWith(handle.GetDeviceName(), "gfx94")))) && - (!handle.GetTargetProperties().Xnack() || !*handle.GetTargetProperties().Xnack())) - { - kernel.kernel_file = "gcnAsmBNBwdTrainSpatial.s"; - kernel.kernel_name 
= "miopenGcnAsmBNBwdTrainSpatial"; - - union - { - unsigned u32; - float f32 = 0; - } NHW_value; - - NHW_value.f32 = static_cast(in_nhw); - - build_params << KernelBuildParameters{ - {"ROCM_METADATA_VERSION", context.rmv.UseV3() ? "5" : "4"}, - {"MIO_BN_NHW_FLOAT", NHW_value.u32}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::GcnAsm{}); - } - else - { - kernel.kernel_file = "MIOpenBatchNormBwdSpatial.cl"; - kernel.kernel_name = "MIOpenBatchNormBwdSpatial"; - - build_params << KernelBuildParameters{ - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - } - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } - - const auto dtype = problem.GetBnScale().GetType(); - const auto useSaved = problem.UseSaved(); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(dtype, [&](auto as_float) { - if(useSaved) - { - kernel(params.x, - params.dy, - params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - params.savedMean, - params.savedInvVariance, - as_float(inhw)); - } - else - { - kernel(params.x, - params.dy, - params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - params.epsilon, - inhw); - } - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git 
a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index a05fce5105..d338f8001a 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -1,172 +1,210 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnFwdInference::IsApplicable(const ExecutionContext&, - const miopen::batchnorm::ProblemDescription& bn_problem) const -{ - if(bn_problem.IsLayoutNHWC()) - return false; - if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) - return false; - if(!(bn_problem.IsFp32() or bn_problem.IsFp16() or bn_problem.IsBFp16())) - return false; - if(!bn_problem.Is2D()) - return false; - if(!IsOCLInferTypeValid(bn_problem)) - return false; - - return true; -} - -ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t xlocalsize = 1; - auto xgridsize = c; - size_t ylocalsize = 256; - size_t ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize); - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - kernel.kernel_file = "MIOpenBatchNormFwdInfer"; // build this up - kernel.kernel_name = "MIOpenBatchNormFwdInfer"; - 
if(problem.GetMode() == miopenBNSpatial) - { // SPATIAL kernels - kernel.kernel_file += "Spatial.cl"; - kernel.kernel_name += "SpatialEst"; - } - else - { // PER ACTIVATION - kernel.kernel_file += "PerAct.cl"; - kernel.kernel_name += "PerActivationEst"; - } - - const auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } - - result.invoker_factory = [](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - int n_, c_, h_, w_; - std::tie(n_, c_, h_, w_) = tien<4>(params.xDesc->GetLengths()); - - unsigned int in_nstride_ = c_ * h_ * w_; - unsigned int in_cstride_ = h_ * w_; - - kernel(params.x, - params.y, - params.estimatedMean, - params.estimatedVariance, - params.bnScale, - params.bnBias, - params.epsilon, - n_, - in_cstride_, - in_nstride_); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen +/******************************************************************************* + * + * MIT License + * + * 
Copyright (c) 2021-2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +bool BnFwdInference::IsApplicable(const ExecutionContext&, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) + return false; + if(!(bn_problem.IsFp32() or bn_problem.IsFp16() or bn_problem.IsBFp16())) + return false; + if(!bn_problem.Is2D()) + return false; + if(!IsOCLInferTypeValid(bn_problem)) + return false; + + return true; +} + +ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const +{ + const auto& handle = context.GetStream(); + + bool bfpmixparm = false; + bool bbfpmixparam = false; + bool bfp16parm = false; + bool bfp32parm = true; + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) + { + bfp16parm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenHalf && + problem.GetBnScale().GetType() == miopenFloat) + { + bfpmixparm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat) + { + bbfpmixparam = true; + bfp32parm = false; + } + + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + + auto result = ConvSolution{miopenStatusSuccess}; + + { + size_t xlocalsize, xgridsize, ylocalsize, ygridsize, zlocalsize, zgridsize; + size_t max_localsize = 256; + bool vectorize; + if(problem.GetXDesc().GetLayout_t() == miopenTensorNHWC) + { + vectorize = c % 4 == 0; + int vector_size = vectorize ? 
4 : 1; + xlocalsize = std::min(size_t{c / vector_size}, max_localsize); + xgridsize = xlocalsize * ((c / vector_size + xlocalsize - 1) / xlocalsize); + ylocalsize = max_localsize / xlocalsize; + ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize); + } + else + { + vectorize = in_cstride % 4 == 0; + int vector_size = vectorize ? 4 : 1; + xlocalsize = 1; + xgridsize = c; + ylocalsize = max_localsize; + ygridsize = ylocalsize * ((in_cstride / vector_size + ylocalsize - 1) / ylocalsize); + } + zlocalsize = 1; + zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenBatchNormFwdInfer"; // build this up + kernel.kernel_name = "MIOpenBatchNormFwdInfer"; + if(problem.GetMode() == miopenBNSpatial) + { // SPATIAL kernels + kernel.kernel_file += "Spatial.cl"; + kernel.kernel_name += "SpatialEst"; + } + else + { // PER ACTIVATION + kernel.kernel_file += "PerAct.cl"; + kernel.kernel_name += "PerActivationEst"; + } + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, + {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, + {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, + {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, + {"MIO_BN_GRP0", xlocalsize}, + {"MIO_BN_GRP1", ylocalsize}, + {"MIO_BN_GRP2", zlocalsize}, + {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, + {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, + {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, + {"MIO_BN_VECTORIZE", static_cast(vectorize)}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + int n_, c_, h_, w_; + std::tie(n_, c_, h_, w_) = tien<4>(params.xDesc->GetLengths()); + + unsigned int in_nstride_ = c_ * h_ * w_; + + if(params.xDesc->GetLayout_t() == miopenTensorNHWC) + { + kernel(params.x, + params.y, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias, + params.epsilon, + c_, + h_ * w_, + n_, + 1, // cStride + c_, // hwStride + in_nstride_); // batchStride + } + else + { + kernel(params.x, + params.y, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias, + params.epsilon, + c_, + h_ * w_, + n_, + h_ * w_, // cStride + 1, // hwStride + in_nstride_); // batchStride + } + }; + }; + + return result; +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/batchnorm/forward_spatial.cpp b/src/solver/batchnorm/forward_spatial.cpp new file mode 100644 index 0000000000..49872a3637 --- /dev/null +++ b/src/solver/batchnorm/forward_spatial.cpp @@ -0,0 +1,500 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +// Spatial multiple needs space for 2 fp32 elements +// per each x thread (including the last workgroup) +// to stash intermediate mean and variance +const unsigned int stash_values_fwd = 2; + +bool PerformanceConfigBnFwdTraining::IsValid( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const +{ + if(this->kernel_id.empty()) + { + return false; + } + + // if default config is variant 2, check if it can be applied + // (based on variant 2 restrictions) + size_t vectorsize; + int variant = 0; + GetVariantFromKernelId(this->kernel_id, variant, vectorsize); + if(variant == 2) + { + return IsSpatialMultipleApplicable(problem, vectorsize, stash_values_fwd); + } + return true; +} + +void PerformanceConfigBnFwdTraining::HeuristicInit( + const miopen::batchnorm::ProblemDescription& problem) +{ + // Define default configuration based on heuristics and + // add all other valid configurations for the given problem + if(UseMultiple(problem)) + { + DefaultConfigSpatialMultiple(problem, stash_values_fwd, this->valid_kernels); + DefaultConfigSpatialSingle(problem, this->valid_kernels); + } + else + { + DefaultConfigSpatialSingle(problem, this->valid_kernels); + DefaultConfigSpatialMultiple(problem, stash_values_fwd, this->valid_kernels); + } + + // Set index and kernel_id to default value + this->index = 0; + this->kernel_id = valid_kernels[0]; +} + +bool PerformanceConfigBnFwdTraining::SetNextValue( + const miopen::batchnorm::ProblemDescription& problem_desc) +{ + // In case the valid_kernel list is empty, we fill it with + // default value as first one and all other valid ones will follow + if(this->valid_kernels.empty()) + { + this->HeuristicInit(problem_desc); + return true; + } + // Get next valid 
configuration + if((this->index + 1) < valid_kernels.size()) + { + ++this->index; + this->kernel_id = this->valid_kernels[index]; + return true; + } + else + { + return false; + } +} + +bool PerformanceConfigBnFwdTraining::operator==(const PerformanceConfigBnFwdTraining& other) const +{ + return this->kernel_id == other.kernel_id; +} + +bool PerformanceConfigBnFwdTraining::IsValidValue() const +{ + return this->index >= 0 && this->index < valid_kernels.size(); +} + +bool BnFwdTrainingSpatial::IsApplicable( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const +{ + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || + bn_problem.GetMode() != miopenBNSpatial) + return false; + + if(!bn_problem.Is2D()) + return false; + + if(!IsOCLFwdTrainTypeValid(bn_problem)) + return false; + + return true; +} + +PerformanceConfigBnFwdTraining BnFwdTrainingSpatial::GetDefaultPerformanceConfig( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem_desc) const +{ + PerformanceConfigBnFwdTraining pp; + pp.HeuristicInit(problem_desc); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} + +bool BnFwdTrainingSpatial::IsValidPerformanceConfig( + const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem_desc, + const PerformanceConfigBnFwdTraining& config) const +{ + bool valid = config.IsValid(ctx, problem_desc); + return valid; +} + +PerformanceConfigBnFwdTraining +BnFwdTrainingSpatial::Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, problem, invoke_ctx); +} + +ConvSolution BnFwdTrainingSpatial::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnFwdTraining& config) const +{ + const auto& handle = context.GetStream(); + + bool bfpmixparm = false; + bool bbfpmixparam = false; + 
bool bfp16parm = false; + bool bfp32parm = true; + + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) + { + bfp16parm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenHalf && + problem.GetBnScale().GetType() == miopenFloat) + { + bfpmixparm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat) + { + bbfpmixparam = true; + bfp32parm = false; + } + + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nstride = c * in_cstride; + unsigned int in_nhw = n * in_cstride; + unsigned int in_nchw = n * in_nstride; + auto inhw = float(1.0 / in_nhw); + + int variant = -1; + size_t vectorsize = 1; + GetVariantFromKernelId(config.kernel_id, variant, vectorsize); + + size_t xlocalsize, xgridsize; + size_t ylocalsize = 1, ygridsize = 1, zlocalsize = 1, zgridsize = 1; + unsigned int ldsgcn, ldsnogcn; + int stash_method = -1; + + if(variant != 2) + { + xlocalsize = 1024; + if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) + { + xlocalsize = 256; + } + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; +#if(WORKAROUND_SWDEV_253606 == 0) + if(variant == 4) + { + xlocalsize = 256; + xgridsize = c * xlocalsize; + ylocalsize = 1; + ygridsize = 1; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } +#endif + } + else + { + GetSpatialMultipleConfig(problem, + stash_values_fwd, + vectorsize, + xlocalsize, + ylocalsize, + xgridsize, + ygridsize, + stash_method); + ldsnogcn = xlocalsize * ylocalsize; + ldsgcn = xlocalsize * ylocalsize / 64; + } + + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, + {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, + 
{"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, + {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, + {"MIO_SAVE_MEAN_VARIANCE", static_cast(problem.GetResultSave())}, + {"MIO_RUNNING_RESULT", static_cast(problem.GetResultRunning())}, + {"MIO_BN_VARIANT", variant}, + {"MIO_BN_LDS_SIZE", ldsnogcn}, + {"MIO_BN_LDSGCN_SIZE", std::to_string(ldsgcn)}, + {"MIO_BN_N", n}, + {"MIO_BN_NGRPS", ygridsize / ylocalsize}, + {"MIO_BN_GRP0", xlocalsize}, + {"MIO_BN_GRP1", ylocalsize}, + {"MIO_BN_GRP2", zlocalsize}, + {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, + {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, + {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, + {"MIO_BN_VECTORIZE", static_cast(vectorsize > 1)}, + {"MIO_BN_STASH_METHOD", stash_method}, + }; + + if(variant != 4) + { + build_params.Define("MIO_BN_C", c); + build_params.Define("MIO_BN_HW", in_cstride); + build_params.Define("MIO_BN_NHW", in_nhw); + build_params.Define("MIO_BN_CHW", in_nstride); + build_params.Define("MIO_BN_NCHW", in_nchw); + } + + kernel.kernel_file = "MIOpenBatchNormFwdTrainSpatial.cl"; + kernel.kernel_name = "MIOpenBatchNormFwdTrainSpatial"; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + if(variant != 2) + { + result.construction_params.push_back(kernel); + } + else + { + auto copy = kernel; + copy.kernel_name = kernel.kernel_name + "MeanVariance"; + result.construction_params.push_back(copy); + + copy.kernel_name = kernel.kernel_name + "FinalMeanVariance"; + copy.g_wk[1] = kernel.l_wk[1]; + result.construction_params.push_back(copy); + + copy.kernel_name = kernel.kernel_name + "Norm"; + copy.g_wk[1] 
= kernel.g_wk[1]; + result.construction_params.push_back(copy); + } + } + + const auto dtype = problem.GetBnScale().GetType(); + const auto vn4 = (variant != 4); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + const auto resultsave = + params.resultSaveMean != nullptr && params.resultSaveInvVariance != nullptr; + const auto resultrunning = + params.resultRunningMean != nullptr && params.resultRunningVariance != nullptr; + + float ctime = 0.; + visit_float(dtype, [&](auto as_float) { + if(variant != 2) + { + decltype(auto) kernel = handle_.Run(kernels.front()); + if(resultsave && resultrunning) + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance, + in_cstride, + in_nstride); + } + } + else if(resultsave) + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance, + in_cstride, + in_nstride); + } + } + else if(resultrunning) + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + 
params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + in_cstride, + in_nstride); + } + } + else + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon, + in_cstride, + in_nstride); + } + } + } + else + { + handle_.Run(kernels[0])(params.x, params.y); + profileSequence(handle_, 0, &ctime); + + if(resultsave && resultrunning) + { + handle_.Run(kernels[1])(params.y, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else if(resultsave) + { + handle_.Run(kernels[1])(params.y, + as_float(inhw), + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else if(resultrunning) + { + handle_.Run(kernels[1])(params.y, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon); + } + else + { + handle_.Run(kernels[1])(params.y, as_float(inhw), params.epsilon); + } + + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[2])(params.x, params.y, params.bnScale, params.bnBias); + profileSequence(handle_, 2, &ctime); + } + }); + }; + }; + + return result; +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp deleted file mode 100644 index 6a2c42743b..0000000000 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ /dev/null @@ -1,281 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -#define WORKAROUND_SWDEV_253606 1 - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnFwdTrainingSpatialMultiple::IsApplicable( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || - problem.GetMode() != miopenBNSpatial) - return false; - - if(!IsOCLFwdTrainTypeValid(problem)) - return false; - - return !BnFwdTrainingSpatialSingle{}.IsApplicable(context, problem); -} - -ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - const auto& xDesc = problem.GetXDesc(); - const auto& bnScaleBiasMeanVarDesc = problem.GetBnScale(); - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1024; - if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) - xlocalsize = 256; - - size_t ylocalsize = 1; - - size_t xgridsize = c * xlocalsize; - size_t ygridsize = 1; - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - if(xDesc.GetType() == miopenHalf && bnScaleBiasMeanVarDesc.GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(xDesc.GetType() == miopenHalf && bnScaleBiasMeanVarDesc.GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - 
bfp32parm = false; - } - - int variant = 1; - unsigned int ldsgcn = xlocalsize / 64; - unsigned int ldsnogcn = xlocalsize; - - if(!problem.IsLayoutNHWC()) - { -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) - { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else -#endif - - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - } - - auto result = ConvSolution{miopenStatusSuccess}; - - { - auto kernel = KernelInfo{}; - - kernel.kernel_name = "MIOpenBatchNormFwdTrainSpatial"; - kernel.kernel_file = "MIOpenBatchNormFwdTrainSpatial.cl"; - - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_SAVE_MEAN_VARIANCE", static_cast(problem.GetResultSave())}, - {"MIO_RUNNING_RESULT", static_cast(problem.GetResultRunning())}, - {"MIO_BN_N", n}, - {"MIO_BN_C", c}, - {"MIO_BN_HW", in_cstride}, - {"MIO_BN_NHW", in_nhw}, - {"MIO_BN_CHW", in_nstride}, - {"MIO_BN_NCHW", in_nchw}, - {"MIO_BN_NGRPS", 
int(std::ceil(float(ygridsize) / ylocalsize))}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", ldsgcn}, - {"MIO_BN_VARIANT", variant}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - auto copy = kernel; - copy.kernel_name = kernel.kernel_name + "MeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalMeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "Norm"; - result.construction_params.push_back(copy); - } - - const auto dtype = bnScaleBiasMeanVarDesc.GetType(); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - const auto resultsave = - params.resultSaveMean != nullptr && params.resultSaveInvVariance != nullptr; - const auto resultrunning = - params.resultRunningMean != nullptr && params.resultRunningVariance != nullptr; - - float ctime = 0.; - visit_float(dtype, [&](auto as_float) { - handle_.Run(kernels[0])(params.x, params.y); - profileSequence(handle_, 0, &ctime); - - if(resultsave && resultrunning) - { - handle_.Run(kernels[1])(params.y, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - 
params.resultRunningVariance, - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance); - } - else if(resultsave) - { - handle_.Run(kernels[1])(params.y, - as_float(inhw), - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance); - } - else if(resultrunning) - { - handle_.Run(kernels[1])(params.y, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon); - } - else - { - handle_.Run(kernels[1])(params.y, as_float(inhw), params.epsilon); - } - - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[2])(params.x, params.y, params.bnScale, params.bnBias); - profileSequence(handle_, 2, &ctime); - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp deleted file mode 100644 index ccfebce987..0000000000 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ /dev/null @@ -1,385 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#include - -#include -#include -#include -#include - -#define WORKAROUND_SWDEV_253606 1 - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnFwdTrainingSpatialSingle::IsApplicable( - const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const -{ - if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || - bn_problem.GetMode() != miopenBNSpatial) - return false; - - if(!IsOCLFwdTrainTypeValid(bn_problem)) - return false; - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(bn_problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nhw = n * in_cstride; - - bool bfpmixparm = false; - bool bfp32parm = true; - - if(bn_problem.GetXDesc().GetType() == miopenHalf && - bn_problem.GetBnScale().GetType() == miopenHalf) - { - bfp32parm = false; - } - else if((bn_problem.GetXDesc().GetType() == miopenHalf || - bn_problem.GetXDesc().GetType() == miopenBFloat16) && - bn_problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - - // clang-format off - if(!(WORKAROUND_SWDEV_253606 == 0 && n < 3) && - !((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm) || - in_cstride <= 512)) - return false; - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - return 
false; - } - - return true; -} - -ConvSolution -BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1024; - if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) - xlocalsize = 256; - - size_t ylocalsize = 1; - - size_t xgridsize = c * xlocalsize; - size_t ygridsize = 1; - - int variant = 1; - unsigned int ldsgcn = xlocalsize / 64; - unsigned int ldsnogcn = xlocalsize; - - if(!problem.IsLayoutNHWC()) - { -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) - { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else -#endif - { - // clang-format off - if( (in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && (bfpmixparm || bbfpmixparam)) || - ((in_cstride > 512) && (bfpmixparm || bbfpmixparam))) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize 
= 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - } - } - - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - kernel.kernel_name = "MIOpenBatchNormFwdTrainSpatial"; - kernel.kernel_file = "MIOpenBatchNormFwdTrainSpatial.cl"; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_SAVE_MEAN_VARIANCE", static_cast(problem.GetResultSave())}, - {"MIO_RUNNING_RESULT", static_cast(problem.GetResultRunning())}, - {"MIO_BN_VARIANT", variant}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", std::to_string(ldsgcn)}, - {"MIO_BN_N", n}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - if(variant != 4) - { - build_params.Define("MIO_BN_C", c); - build_params.Define("MIO_BN_HW", in_cstride); - build_params.Define("MIO_BN_NHW", in_nhw); - build_params.Define("MIO_BN_CHW", in_nstride); - build_params.Define("MIO_BN_NCHW", in_nchw); - } - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } - - const auto dtype = problem.GetBnScale().GetType(); - const auto vn4 = (variant != 4); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - const auto resultsave = - params.resultSaveMean != nullptr && params.resultSaveInvVariance != nullptr; - const auto resultrunning = - params.resultRunningMean != nullptr && params.resultRunningVariance != nullptr; - - visit_float(dtype, [&](auto as_float) { - if(resultsave && resultrunning) - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance, - in_cstride, - in_nstride); - } - } - else if(resultsave) - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon, - 
params.resultSaveMean, - params.resultSaveInvVariance); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance, - in_cstride, - in_nstride); - } - } - else if(resultrunning) - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon, - in_cstride, - in_nstride); - } - } - else - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon, - in_cstride, - in_nstride); - } - } - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 6fbc428b1a..1e64290c19 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -162,7 +162,6 @@ void batchNormSpatialHostInference(const tensor& input, output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - // std::cout << output(bidx, cidx, row, column) << ","; } } } @@ -542,32 +541,44 @@ void batchNormPerActHostFwdTrain(const tensor& input, scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); } // end for(n_batch) - newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); - runMean(0, cidx, row, column) = - mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, row, 
column) * (1.0 - expAvgFactor); + runMean(0, cidx, row, column) = + mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + } // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; - runVar(0, cidx, row, column) = - (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + if(!runVar.data.empty()) + { + adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; + runVar(0, cidx, row, column) = + (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; + } + if(!saveMean.data.empty() || !saveInvVar.data.empty()) + { + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + } } // for (column) } // for (row) }); } -template -void batchNormPerActHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - const tensor& scale, - tensor& dscale, - tensor& dbias, - tensor& dx_out, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormPerActHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -621,7 +632,7 @@ void batchNormPerActHostBwdTrain(const tensor& x_input, n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - tmp1; double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); } // end for(n_batchs) } // for (column) } // for (row) diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 544e0526cc..648048c4ee 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -105,7 
+105,12 @@ function(add_gtest TEST_NAME TEST_CPP) # Enable CMake to discover the test binary # Note: Due to the following cmake issue with gtest_discover_tests https://gitlab.kitware.com/cmake/cmake/-/issues/17812 you cannot pass PROPERTIES as a list. # To work around this limitation, we are passing the environment variables in the format ENVIRONMENT;value1=${value1};ENVIRONMENT;value2=${value2}. - gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES ${TEST_ENVIRONMENT_VARIABLES} TEST_FILTER "${MIOPEN_GTEST_FILTER}") + # bn tests are run sequentially since running tests in parallel was causing large tensor case fail with insufficient memory error. + if("${TEST_NAME}" STREQUAL "test_bn_bwd_serial_run" OR "${TEST_NAME}" STREQUAL "test_bn_fwd_train_serial_run" OR "${TEST_NAME}" STREQUAL "test_bn_infer_serial_run") + gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES RUN_SERIAL TRUE ${TEST_ENVIRONMENT_VARIABLES} TEST_FILTER "${MIOPEN_GTEST_FILTER}") + else() + gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES ${TEST_ENVIRONMENT_VARIABLES} TEST_FILTER "${MIOPEN_GTEST_FILTER}") + endif() endif() target_link_libraries(${TEST_NAME} BZip2::BZip2) if(WIN32) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 6734d0f99d..d89d950c13 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -62,12 +62,23 @@ static std::string ApiVerisonToString(int api_version) } } +static std::string BNModeToString(int bn_mode) +{ + switch(bn_mode) + { + case miopenBNPerActivation: return "BNPerActivation"; + case miopenBNSpatial: return "BNSpatial"; + default: return "UnknownBNMode"; + } +} + // Custom test name generator to handle enums template struct TestNameGenerator { - 
std::string operator()( - const testing::TestParamInfo>& info) + std::string + operator()(const testing::TestParamInfo< + std::tuple>& info) const { constexpr int dimension = std::is_same::value ? 2 @@ -75,15 +86,17 @@ struct TestNameGenerator : -1; static_assert(dimension > 0); - const auto& layout_type = std::get<1>(info.param); - const auto& api_type = std::get<2>(info.param); + const auto& layout_type = std::get<1>(info.param); + const auto& batchnorm_mode = std::get<2>(info.param); + const auto& api_type = std::get<3>(info.param); - std::string tensor_name = LayoutToString(layout_type); - std::string api_name = ApiVerisonToString(api_type); + std::string tensor_name = LayoutToString(layout_type); + std::string bn_mode_name = BNModeToString(batchnorm_mode); + std::string api_name = ApiVerisonToString(api_type); std::ostringstream oss; - oss << tensor_name + "_" + api_name + "_Dim_" + std::to_string(dimension) + "_test_id_" + - std::to_string(info.index); + oss << tensor_name + "_" + bn_mode_name + "_" + api_name + "_Dim_" + + std::to_string(dimension) + "_test_id_" + std::to_string(info.index); return oss.str(); } }; @@ -93,15 +106,17 @@ template struct BNInferTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout, api_type) = this->GetParam(); - bn_infer_test_data.SetUpImpl(bn_config, tensor_layout); + std::tie(bn_config, tensor_layout, bn_mode, api_type) = this->GetParam(); + bn_infer_test_data.SetUpImpl(bn_config, bn_mode, tensor_layout); auto&& handle = get_handle(); if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) @@ -113,7 +128,7 @@ struct BNInferTest if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationForwardInference(&handle, - bn_config.mode, + bn_mode, &bn_infer_test_data.alpha, &bn_infer_test_data.beta, &bn_infer_test_data.input.desc, @@ -131,7 +146,7 @@ struct BNInferTest { res = 
miopenBatchNormalizationForwardInference_V2( &handle, - bn_config.mode, + bn_mode, &bn_infer_test_data.alpha, &bn_infer_test_data.beta, &bn_infer_test_data.input.desc, @@ -172,33 +187,40 @@ struct BNInferTest bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); test::ComputeCPUBNInference(bn_infer_test_data); // 4e-3 is tolerance used by CK kernel. - test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 4e-3); + test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.out_ref, 4e-3); } TestCase bn_config; bool test_skipped = false; - BNInferTestData + BNInferTestData bn_infer_test_data; miopenTensorLayout_t tensor_layout; + miopenBatchNormMode_t bn_mode; BNApiType api_type; }; template -struct BNBwdTest - : public ::testing::TestWithParam> +struct BNBwdTest : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout, api_type) = this->GetParam(); - bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); + std::tie(bn_config, tensor_layout, bn_mode, api_type) = this->GetParam(); + bn_bwd_test_data.SetUpImpl(bn_config, bn_mode, tensor_layout); auto&& handle = get_handle(); if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) @@ -210,7 +232,7 @@ struct BNBwdTest if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationBackward(&handle, - bn_config.mode, + bn_mode, &bn_bwd_test_data.alphaDataDiff, &bn_bwd_test_data.betaDataDiff, &bn_bwd_test_data.alphaParamDiff, @@ -232,7 +254,7 @@ struct BNBwdTest else if(api_type == BNApiType::testBNAPIV2) { res = miopenBatchNormalizationBackward_V2(&handle, - bn_config.mode, + bn_mode, &bn_bwd_test_data.alphaDataDiff, &bn_bwd_test_data.betaDataDiff, &bn_bwd_test_data.alphaParamDiff, @@ -283,10 +305,11 @@ struct BNBwdTest test::ComputeCPUBNBwd(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, bwd_tol); - test::CompareTensor( + test::CompareTensor( + 
bn_bwd_test_data.output, bn_bwd_test_data.out_ref, bwd_tol); + test::CompareTensor( bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, bwd_tol); - test::CompareTensor( + test::CompareTensor( bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, bwd_tol); } @@ -295,13 +318,14 @@ struct BNBwdTest BNBwdTestData bn_bwd_test_data; miopenTensorLayout_t tensor_layout; + miopenBatchNormMode_t bn_mode; BNApiType api_type; double bwd_tol = 4e-3; }; @@ -310,16 +334,18 @@ template struct BNFwdTrainTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout, api_type) = this->GetParam(); - bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); + std::tie(bn_config, tensor_layout, bn_mode, api_type) = this->GetParam(); + bn_fwd_train_test_data.SetUpImpl(bn_config, bn_mode, tensor_layout); auto&& handle = get_handle(); if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) @@ -332,7 +358,7 @@ struct BNFwdTrainTest { res = miopenBatchNormalizationForwardTraining( &handle, - bn_config.mode, + bn_mode, &bn_fwd_train_test_data.alpha, &bn_fwd_train_test_data.beta, &bn_fwd_train_test_data.input.desc, @@ -353,7 +379,7 @@ struct BNFwdTrainTest { res = miopenBatchNormalizationForwardTraining_V2( &handle, - bn_config.mode, + bn_mode, &bn_fwd_train_test_data.alpha, &bn_fwd_train_test_data.beta, &bn_fwd_train_test_data.input.desc, @@ -402,35 +428,42 @@ struct BNFwdTrainTest bn_fwd_train_test_data.output.data = handle.Read( bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size()); - bn_fwd_train_test_data.saveMean.data = handle.Read( + bn_fwd_train_test_data.saveMean.data = handle.Read( bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size()); bn_fwd_train_test_data.saveVariance.data = - handle.Read(bn_fwd_train_test_data.saveVariance_dev, - bn_fwd_train_test_data.saveVariance_ref.data.size()); - 
bn_fwd_train_test_data.runMean.data = handle.Read( + handle.Read(bn_fwd_train_test_data.saveVariance_dev, + bn_fwd_train_test_data.saveVariance_ref.data.size()); + bn_fwd_train_test_data.runMean.data = handle.Read( bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size()); bn_fwd_train_test_data.runVariance.data = - handle.Read(bn_fwd_train_test_data.runVariance_dev, - bn_fwd_train_test_data.runVariance_ref.data.size()); + handle.Read(bn_fwd_train_test_data.runVariance_dev, + bn_fwd_train_test_data.runVariance_ref.data.size()); test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data); // 4e-3 is tolerance used by CK kernel. test::CompareTensor( - bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3); - test::CompareTensor( + bn_fwd_train_test_data.output, bn_fwd_train_test_data.out_ref, 4e-3); + test::CompareTensor( bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3); - test::CompareTensor( + test::CompareTensor( bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3); - test::CompareTensor( + test::CompareTensor( bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3); - test::CompareTensor( + test::CompareTensor( bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3); } TestCase bn_config; bool test_skipped = false; - BNFwdTrainTestData + BNFwdTrainTestData bn_fwd_train_test_data; miopenTensorLayout_t tensor_layout; + miopenBatchNormMode_t bn_mode; BNApiType api_type; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index 5e8c3856f5..0854ec2134 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -28,13 +28,19 @@ /* typename XDataType, typename DxDataType, typename DyDataType, - typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, - typename MeanVarDataType> */ - -struct GPU_BNBWDSmall_FP32 - : BNBwdTest + typename MeanVarDataType, + typename AccDataType> */ + +struct 
GPU_BNBWDSmall_FP32 : BNBwdTest { }; @@ -44,7 +50,7 @@ struct GPU_BNOCLBWDLarge2D_FP16 : BNBwdTest { }; @@ -55,38 +61,38 @@ struct GPU_BNOCLBWDLarge3D_FP16 : BNBwdTest { }; struct GPU_BNCKBWDLarge2D_BFP16 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNOCLBWDLarge2D_BFP16 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNOCLBWDLarge3D_BFP16 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNBWDSmall2D_FP32 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNBWDLarge2D_FP32 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNBWDLarge3D_FP32 - : BNBwdTest + : BNBwdTest { }; @@ -124,20 +130,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDSmall_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge3D_FP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -146,20 +158,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKBWDLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), 
+ testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge3D_BFP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -167,20 +185,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDSmall2D_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDLarge2D_FP32, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDLarge3D_FP32, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // fp64 @@ -188,6 +212,8 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDSmall2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); @@ -195,5 +221,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDLarge2D_FP64, 
testing::Combine(testing::ValuesIn(Network2DLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_bwd_serial_run.cpp b/test/gtest/bn_bwd_serial_run.cpp new file mode 100644 index 0000000000..66f1d56d26 --- /dev/null +++ b/test/gtest/bn_bwd_serial_run.cpp @@ -0,0 +1,111 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "bn.hpp" +/* typename XDataType, + typename DxDataType, + typename DyDataType, + typename ScaleDataType, + typename DscaleDbiasDataType, + typename MeanVarDataType, + typename AccDataType> */ + +struct GPU_BNOCLBWDSerialRun3D_FP16 : BNBwdTest +{ +}; + +struct GPU_BNOCLBWDSerialRun3D_BFP16 + : BNBwdTest +{ +}; + +struct GPU_BNBWDSerialRun3D_FP32 + : BNBwdTest +{ +}; + +struct GPU_BNBWDSerialRun3D_FP64 + : BNBwdTest +{ +}; + +// fp16 +TEST_P(GPU_BNOCLBWDSerialRun3D_FP16, BnV2SerialRunBWDOCL3D_fp16) {} + +// bfp16 +TEST_P(GPU_BNOCLBWDSerialRun3D_BFP16, BnV2SerialRunBWDOCLbfp16_3D) {} + +// fp32 (float) +TEST_P(GPU_BNBWDSerialRun3D_FP32, BnV2SerialRunBWDCKfp32_3D) {} + +// fp64 +TEST_P(GPU_BNBWDSerialRun3D_FP64, DISABLED_BnV2SerialRunBWDCKfp64_3D) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLBWDSerialRun3D_FP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); + +// bfp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLBWDSerialRun3D_BFP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNBWDSerialRun3D_FP32, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); +// fp64 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNBWDSerialRun3D_FP64, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCHW, 
miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index a2c2f6da72..2483c0905f 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -26,76 +26,75 @@ #include "bn.hpp" -// XDataType : half -// YDataYype : half -// ScaleDataType : half -// BiasDataType : half -// MeanVarDataType : float +// XDataType +// YDataYype +// ScaleDataType +// BiasDataType +// RunSaveDataType +// AccDataType struct GPU_BNCKFWDTrainLarge2D_FP16 : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge2D_FP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge3D_FP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; -// XDataType : bfloat16 -// YDataYype : bfloat16 -// ScaleDataType : bfloat16 -// BiasDataType : bfloat16 -// MeanVarDataType : float - struct GPU_BNCKFWDTrainLarge2D_BFP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge2D_BFP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge3D_BFP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; -struct GPU_BNFWDTrainSmall2D_FP32 : BNFwdTrainTest +struct GPU_BNFWDTrainSmall2D_FP32 + : BNFwdTrainTest { }; -struct GPU_BNFWDTrainSmall3D_FP32 : BNFwdTrainTest +struct GPU_BNFWDTrainSmall3D_FP32 + : BNFwdTrainTest { }; -struct GPU_BNFWDTrainLarge2D_FP32 : BNFwdTrainTest +struct GPU_BNFWDTrainLarge2D_FP32 + : BNFwdTrainTest { }; struct GPU_BNFWDTrainSmall2D_FP64 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNFWDTrainLarge2D_FP64 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNFWDTrainSmall3D_FP64 - : BNFwdTrainTest + : BNFwdTrainTest { }; @@ -125,20 +124,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKFWDTrainLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), 
testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge3D_FP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -147,20 +152,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKFWDTrainLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge3D_BFP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); // fp32 @@ -168,27 +179,36 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainSmall2D_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW}), + 
testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFWDTrainSmall3D_FP32, - testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), - testing::ValuesIn({testBNAPIV1, testBNAPIV2})), - TestNameGenerator()); - INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainLarge2D_FP32, testing::Combine(testing::ValuesIn(Network2DLarge()), testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNFWDTrainSmall3D_FP32, + testing::Combine(testing::ValuesIn(Network3DBN()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainSmall2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); @@ -196,12 +216,16 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainLarge2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainSmall3D_FP64, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_fwd_train_serial_run.cpp 
b/test/gtest/bn_fwd_train_serial_run.cpp new file mode 100644 index 0000000000..d4e254434c --- /dev/null +++ b/test/gtest/bn_fwd_train_serial_run.cpp @@ -0,0 +1,105 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "bn.hpp" + +// XDataType +// YDataYype +// ScaleDataType +// BiasDataType +// RunSaveDataType +// AccDataType + +struct GPU_BNOCLFWDTrainSerialRun3D_FP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BNOCLFWDTrainSerialRun3D_BFP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BNFWDTrainSerialRun3D_FP32 + : BNFwdTrainTest +{ +}; + +struct GPU_BNFWDTrainSerialRun3D_FP64 + : BNFwdTrainTest +{ +}; + +// fp16 +TEST_P(GPU_BNOCLFWDTrainSerialRun3D_FP16, BnV2SerialRunFWD_TrainOCL_3D_fp16) {} + +// bfp16 +TEST_P(GPU_BNOCLFWDTrainSerialRun3D_BFP16, BnV2SerialRunFWD_TrainOCL_3Dbfp16) {} + +// fp32 (float) +TEST_P(GPU_BNFWDTrainSerialRun3D_FP32, BnV1SerialRunFWD_Train3Dfp32) {} + +// fp64 +TEST_P(GPU_BNFWDTrainSerialRun3D_FP64, DISABLED_BnV2SerialRunFWD_Train3Dfp64) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLFWDTrainSerialRun3D_FP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// bfp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLFWDTrainSerialRun3D_BFP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); +// fp32 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNFWDTrainSerialRun3D_FP32, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// fp64 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNFWDTrainSerialRun3D_FP64, + 
testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index ad9edf694e..7ddbbd11d7 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -31,22 +31,24 @@ // ScaleDataType : half_float::half // BiasDataType : half_float::half // MeanVarDataType : float +// AccDataType : double struct GPU_BNCKInferLarge2D_FP16 : BNInferTest { }; struct GPU_BNOCLInferLarge2D_FP16 - : BNInferTest + : BNInferTest { }; struct GPU_BNOCLInferLarge3D_FP16 - : BNInferTest + : BNInferTest { }; @@ -56,7 +58,7 @@ struct GPU_BNOCLInferLarge3D_FP16 // BiasDataType : bfloat16 // MeanVarDataType : float struct GPU_BNCKInferLarge2D_BFP16 - : BNInferTest + : BNInferTest { }; @@ -66,30 +68,35 @@ struct GPU_BNCKInferLarge2D_BFP16 // BiasDataType : float // MeanVarDataType : float struct GPU_BNOCLInferLarge2D_BFP16 - : BNInferTest + : BNInferTest { }; struct GPU_BNOCLInferLarge3D_BFP16 - : BNInferTest + : BNInferTest { }; -struct GPU_BNInferSmall2D_FP32 : BNInferTest +struct GPU_BNInferSmall2D_FP32 + : BNInferTest { }; -struct GPU_BNInferSmall3D_FP32 : BNInferTest +struct GPU_BNInferSmall3D_FP32 + : BNInferTest { }; -struct GPU_BNInferLarge2D_FP32 : BNInferTest +struct GPU_BNInferLarge2D_FP32 + : BNInferTest { }; -struct GPU_BNInferSmall2D_FP64 : BNInferTest +struct GPU_BNInferSmall2D_FP64 + : BNInferTest { }; -struct GPU_BNInferLarge2D_FP64 : BNInferTest +struct GPU_BNInferLarge2D_FP64 + : BNInferTest { }; @@ -117,20 +124,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKInferLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); 
INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge3D_FP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); // bfp16 @@ -138,20 +151,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKInferLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge3D_BFP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -159,21 +178,27 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferSmall2D_FP32, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + 
testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferLarge2D_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferSmall3D_FP32, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // fp64 @@ -181,6 +206,8 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferSmall2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); @@ -188,5 +215,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferLarge2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_infer_serial_run.cpp b/test/gtest/bn_infer_serial_run.cpp new file mode 100644 index 0000000000..18277eb757 --- /dev/null +++ b/test/gtest/bn_infer_serial_run.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "bn.hpp" + +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +// AccDataType : double +struct GPU_BNOCLInferSerialRun3D_FP16 + : BNInferTest +{ +}; + +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : float +// BiasDataType : float +// MeanVarDataType : float +struct GPU_BNOCLInferSerialRun3D_BFP16 + : BNInferTest +{ +}; + +struct GPU_BNInferSerialRun3D_FP32 + : BNInferTest +{ +}; + +struct GPU_BNInferSerialRun3D_FP64 + : BNInferTest +{ +}; + +// fp16 +TEST_P(GPU_BNOCLInferSerialRun3D_FP16, BnV2SerialRunInferOCLfp16_3D) {} + +// bfp16 +TEST_P(GPU_BNOCLInferSerialRun3D_BFP16, BnV2SerialRunInferOCLbfp16_3D) {} + +// fp32 (float) +TEST_P(GPU_BNInferSerialRun3D_FP32, DISABLED_BnV2SerialRunInferfp32_3D) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLInferSerialRun3D_FP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); +// bfp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLInferSerialRun3D_BFP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNInferSerialRun3D_FP32, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); diff --git a/test/gtest/bn_test_data.hpp 
b/test/gtest/bn_test_data.hpp index 2bbb87d937..163cd0c90f 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -41,7 +41,6 @@ struct BN2DTestCase size_t C; size_t H; size_t W; - miopenBatchNormMode_t mode; miopen::batchnorm::Direction Direction; bool save; bool keepRunning; @@ -49,8 +48,8 @@ struct BN2DTestCase friend std::ostream& operator<<(std::ostream& ss, const BN2DTestCase& tc) { return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W - << " mode: " << tc.mode << " Direction: " << static_cast(tc.Direction) - << " save: " << tc.save << " keepRunning: " << tc.keepRunning; + << " Direction: " << static_cast(tc.Direction) << " save: " << tc.save + << " keepRunning: " << tc.keepRunning; } std::vector GetInput() const { return {N, C, H, W}; } }; @@ -62,7 +61,6 @@ struct BN3DTestCase size_t D; size_t H; size_t W; - miopenBatchNormMode_t mode; miopen::batchnorm::Direction Direction; bool save; bool keepRunning; @@ -70,9 +68,8 @@ struct BN3DTestCase friend std::ostream& operator<<(std::ostream& ss, const BN3DTestCase& tc) { return ss << "(N: " << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H - << " W:" << tc.W << " mode: " << tc.mode - << " Direction: " << static_cast(tc.Direction) << " save: " << tc.save - << " keepRunning: " << tc.keepRunning; + << " W:" << tc.W << " Direction: " << static_cast(tc.Direction) + << " save: " << tc.save << " keepRunning: " << tc.keepRunning; } std::vector GetInput() const { return {N, C, D, H, W}; } }; @@ -86,88 +83,114 @@ std::vector Network2DLarge(); template std::vector Network3DBN(); +template +std::vector Network3DSerialCase(); + template <> inline std::vector Network2DLarge() { // pyt_mlperf_resnet50v1.5 + // clang-format off return { - {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 2048, 7, 7, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 112, 112, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {768, 1, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {768, 1, 23, 23, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {832, 1, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {832, 1, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}}; + {64, 1, 1024, 1024, miopen::batchnorm::Direction::Backward, 1, 0}, + {192, 1, 8, 8, miopen::batchnorm::Direction::Backward, 1, 0}, + {12, 40, 122, 122, miopen::batchnorm::Direction::Backward, 1, 0}, + {64, 256, 14, 14, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, 
miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 17, 17, miopen::batchnorm::Direction::Backward, 0, 1}, + {128, 256, 14, 14, miopen::batchnorm::Direction::Backward, 0, 1}, + {128, 256, 16, 16, miopen::batchnorm::Direction::Backward, 0, 1}, + {670, 1, 224, 224, miopen::batchnorm::Direction::Backward, 0, 1}, + {768, 1, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {768, 1, 23, 23, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {832, 1, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {832, 1, 28, 28, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // edge cases + {69328, 1, 22, 22, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {69328, 1, 13, 79, miopen::batchnorm::Direction::ForwardTraining, 1, 1} + }; + // clang-format on +} + +// These are very large tensors which caused memory insufficient error +// when ran parallely by ctest. Hence, these are run serially. +// Shape: (2, 2048, 16, 128, 128) --> Size: 1.07e+09 +// For now any test case with tensor size greater then 1e09 need to be run serially. 
+template <> +inline std::vector Network3DSerialCase() +{ + return {{2, 2048, 16, 128, 128, miopen::batchnorm::Direction::Backward, 0, 1}}; } template <> inline std::vector Network2DSmall() { // pyt_mlperf_resnet50v1.5 + // clang-format off return { - {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {64, 2048, 17, 17, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - + {12, 40, 122, 122, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, miopen::batchnorm::Direction::Backward, 1, 0}, + {192, 2, 8, 8, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 56, 56, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, }; + // clang-format on } template <> inline std::vector Network3DBN() { + // clang-format off return { - {2, 2, 3, 224, 224, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 132, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 16, 128, 128, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {2, 2048, 16, 128, 128, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - + {2, 2, 3, 224, 224, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, 28, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 16, 128, 128, miopen::batchnorm::Direction::ForwardTraining, 1, 0} }; + // clang-format on } -template +template struct BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { bn_config = config; tensor_layout = t_layout; + bn_mode = t_bnmode; CreateTensors(); InitTensorsWithRandValue(); SetDirection(); - SetBNMode(); 
WriteToGPU(); } const miopen::TensorDescriptor& GetInputDesc() const { return input.desc; } tensor input; tensor output; - tensor ref_out; + tensor out_ref; miopen::Allocator::ManageDataPtr in_dev; miopen::Allocator::ManageDataPtr out_dev; @@ -184,17 +207,16 @@ struct BNTestData { input = tensor{tensor_layout, bn_config.GetInput()}; output = tensor{tensor_layout, bn_config.GetInput()}; - ref_out = tensor{tensor_layout, bn_config.GetInput()}; + out_ref = tensor{tensor_layout, bn_config.GetInput()}; } void InitTensorsWithRandValue() { - input.generate( - [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }); + // -2.0 to 2.0 + input.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } void SetDirection() { direction = bn_config.Direction; } - void SetBNMode() { bn_mode = bn_config.mode; } void WriteToGPU() { auto&& handle = get_handle(); @@ -208,12 +230,15 @@ template -struct BNInferTestData : public BNTestData +struct BNInferTestData : public BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl( + config, t_bnmode, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -238,32 +263,35 @@ struct BNInferTestData : public BNTestData void CreateTensors() { auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - scale = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - shift = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - estMean = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + miopen::DeriveBNTensorDescriptor( + derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = 
tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + estMean = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; estVariance = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() { - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); - }; - scale.generate(gen_value); - shift.generate(gen_value); - estMean.generate(gen_value); - - auto gen_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - estVariance.generate(gen_var); + // -2.0 to 2.0 + scale.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + shift.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + estMean.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + // estVaraince has to be +ve number otherwise 1/sqrt(-ve) would + // give img number + estVariance.generate( + uniform_unsigned_initializer(2e-3 /*scale*/, 1000 /*range*/)); } void WriteToGPU() { @@ -278,16 +306,18 @@ struct BNInferTestData : public BNTestData template -struct BNBwdTestData : public BNTestData +struct BNBwdTestData : public BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl( + config, t_bnmode, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -301,8 +331,8 @@ struct BNBwdTestData : public BNTestData tensor dy; tensor dScale; tensor dBias; - tensor dScale_ref; - tensor dBias_ref; + tensor dScale_ref; + tensor dBias_ref; miopen::Allocator::ManageDataPtr bnScale_dev; miopen::Allocator::ManageDataPtr savedMean_dev; @@ -321,40 +351,46 @@ struct BNBwdTestData : public BNTestData private: void CreateTensors() { - dy = tensor{BNTestData::tensor_layout, - 
BNTestData::bn_config.GetInput()}; + dy = tensor{ + BNTestData::tensor_layout, + BNTestData::bn_config.GetInput()}; auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - bnScale = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + miopen::DeriveBNTensorDescriptor( + derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + bnScale = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; savedMean = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; savedInvVar = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; dScale = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; dBias = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; - dScale_ref = dScale; - dBias_ref = dBias; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dBias_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() { - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); - }; - dy.generate(gen_value); - bnScale.generate(gen_value); - savedMean.generate(gen_value); - - auto gen_var = [](auto...) 
{ - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedInvVar.generate(gen_var); + dy.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + bnScale.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + savedMean.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + savedInvVar.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); std::fill(dScale.begin(), dScale.end(), 0.); std::fill(dBias.begin(), dBias.end(), 0.); @@ -380,13 +416,16 @@ template -struct BNFwdTrainTestData : public BNTestData +struct BNFwdTrainTestData : public BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl( + config, t_bnmode, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -394,10 +433,10 @@ struct BNFwdTrainTestData : public BNTestData tensor scale; tensor shift; - tensor saveMean; - tensor saveVariance; - tensor runMean; - tensor runVariance; + tensor saveMean; + tensor saveVariance; + tensor runMean; + tensor runVariance; tensor saveMean_ref; tensor saveVariance_ref; @@ -422,41 +461,69 @@ struct BNFwdTrainTestData : public BNTestData void CreateTensors() { auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - scale = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - shift = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - saveMean = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - saveVariance = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - runMean = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - runVariance = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + 
miopen::DeriveBNTensorDescriptor( + derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveMean = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + // ref + saveMean_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() { - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); - }; - scale.generate(gen_value); - shift.generate(gen_value); - - auto gen_var = [](auto...) 
{ - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - runMean.generate(gen_var); - runVariance.generate(gen_var); - - saveMean_ref = saveMean; - saveVariance_ref = saveVariance; - runMean_ref = runMean; - runVariance_ref = runVariance; + // -2.0 to 2.0 + scale.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + shift.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + runMean.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + runVariance.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + + std::transform(saveMean.data.begin(), + saveMean.data.end(), + saveMean_ref.data.begin(), + [](float val) { return static_cast(val); }); + std::transform(saveVariance.data.begin(), + saveVariance.data.end(), + saveVariance_ref.data.begin(), + [](float val) { return static_cast(val); }); + std::transform(runMean.data.begin(), + runMean.data.end(), + runMean_ref.data.begin(), + [](float val) { return static_cast(val); }); + std::transform(runVariance.data.begin(), + runVariance.data.end(), + runVariance_ref.data.begin(), + [](float val) { return static_cast(val); }); } void WriteToGPU() { diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index cff9c90f55..6fcef9fb01 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -39,19 +39,37 @@ void ComputeCPUBNInference(DLModule& dl_module) } }; ReshapeIfNeeded(dl_module.input.desc); - ReshapeIfNeeded(dl_module.ref_out.desc); + ReshapeIfNeeded(dl_module.out_ref.desc); ReshapeIfNeeded(dl_module.scale.desc); ReshapeIfNeeded(dl_module.shift.desc); ReshapeIfNeeded(dl_module.estMean.desc); ReshapeIfNeeded(dl_module.estVariance.desc); - batchNormSpatialHostInference(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.estMean, - dl_module.estVariance); + if(dl_module.bn_mode == miopenBNSpatial) + { + 
batchNormSpatialHostInference(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); + } + else if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActivHostInference(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); + } + else + { + std::cout << "\nUnknown inference batch miopenBatchNormMode_t\n"; + exit(EXIT_FAILURE); + } } template @@ -68,21 +86,40 @@ void ComputeCPUBNBwd(DLModule& dl_module) }; ReshapeIfNeeded(dl_module.input.desc); ReshapeIfNeeded(dl_module.dy.desc); - ReshapeIfNeeded(dl_module.ref_out.desc); + ReshapeIfNeeded(dl_module.out_ref.desc); ReshapeIfNeeded(dl_module.bnScale.desc); ReshapeIfNeeded(dl_module.dScale_ref.desc); ReshapeIfNeeded(dl_module.dBias_ref.desc); ReshapeIfNeeded(dl_module.savedMean.desc); ReshapeIfNeeded(dl_module.savedInvVar.desc); - batchNormSpatialHostBwdTrain(dl_module.input, - dl_module.dy, - dl_module.ref_out, - dl_module.bnScale, - dl_module.dScale_ref, - dl_module.dBias_ref, - dl_module.savedMean, - dl_module.savedInvVar); + if(dl_module.bn_mode == miopenBNSpatial) + { + batchNormSpatialHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.out_ref, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); + } + else if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.out_ref, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); + } + else + { + std::cout << "\nUnknown BwdTrain batch miopenBatchNormMode_t\n"; + exit(EXIT_FAILURE); + } } template @@ -98,7 +135,7 @@ void ComputeCPUBNFwdTrain(DLModule& dl_module) } }; ReshapeIfNeeded(dl_module.input.desc); - ReshapeIfNeeded(dl_module.ref_out.desc); + 
ReshapeIfNeeded(dl_module.out_ref.desc); ReshapeIfNeeded(dl_module.scale.desc); ReshapeIfNeeded(dl_module.shift.desc); ReshapeIfNeeded(dl_module.saveMean_ref.desc); @@ -106,30 +143,51 @@ void ComputeCPUBNFwdTrain(DLModule& dl_module) ReshapeIfNeeded(dl_module.runMean_ref.desc); ReshapeIfNeeded(dl_module.runVariance_ref.desc); - batchNormSpatialHostFwdTrain(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.averageFactor, - dl_module.saveMean_ref, - dl_module.saveVariance_ref, - dl_module.runMean_ref, - dl_module.runVariance_ref); + if(dl_module.bn_mode == miopenBNSpatial) + { + batchNormSpatialHostFwdTrain(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); + } + else if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActHostFwdTrain(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); + } + else + { + std::cout << "\nUnknown FwdTrain batch miopenBatchNormMode_t\n"; + exit(EXIT_FAILURE); + } } -template +template void CompareTensor(const tensor& output, - const tensor& ref_out, + const tensor& out_ref, const double threshold = std::numeric_limits::epsilon()) { - EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; + EXPECT_FALSE(miopen::range_zero(out_ref)) << "CPU data is all zeros"; EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) << "Non finite number found in the GPU data"; - EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - auto error = miopen::rms_range(ref_out, output); - 
EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) + EXPECT_TRUE(miopen::range_distance(out_ref) == miopen::range_distance(output)); + auto error = miopen::rms_range(out_ref, output); + EXPECT_FALSE(miopen::find_idx(out_ref, miopen::not_finite) >= 0) << "Non finite number found in the CPU data"; EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; diff --git a/test/gtest/unit_FinInterface.cpp b/test/gtest/unit_FinInterface.cpp index 07d8208d01..acd9b8be3f 100644 --- a/test/gtest/unit_FinInterface.cpp +++ b/test/gtest/unit_FinInterface.cpp @@ -320,16 +320,15 @@ const auto& GetSolversInfo() /// \ref fin_interface_solver_info_coverage static const std::unordered_map solver_info = { // clang-format off - {"BnFwdTrainingSpatialSingle", {113, false, false}}, - {"BnFwdTrainingSpatialMultiple", {115, false, false}}, - {"BnFwdTrainingPerActivation", {116, false, false}}, - {"BnBwdTrainingSpatialSingle", {117, false, false}}, - {"BnBwdTrainingSpatialMultiple", {118, false, false}}, - {"BnBwdTrainingPerActivation", {119, false, false}}, - {"BnFwdInference", {120, true, false}}, - {"BnCKFwdInference", {142, true, true}}, - {"BnCKBwdBackward", {143, true, true}}, - {"BnCKFwdTraining", {144, true, true}}, + // solver-name id, isDynamic, isTunable + {"BnFwdTrainingSpatial", {113, false, true}}, + {"BnFwdTrainingPerActivation", {116, false, false}}, + {"BnBwdTrainingSpatial", {117, false, true}}, + {"BnBwdTrainingPerActivation", {119, false, false}}, + {"BnFwdInference", {120, true, false}}, + {"BnCKFwdInference", {142, true, true}}, + {"BnCKBwdBackward", {143, true, true}}, + {"BnCKFwdTraining", {144, true, true}}, // clang-format on }; diff --git a/test/random.hpp b/test/random.hpp index 9b4815bc1d..62443abb10 100644 --- a/test/random.hpp +++ b/test/random.hpp @@ -40,5 +40,23 @@ inline T gen_descreet_unsigned(double scale, int32_t range) { return static_cast(scale * static_cast(gen_0_to_B(range))); } + } // 
namespace prng + +// lambda factory +template +auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { + // uniform sign give balance of both negative and positive values + return prng::gen_descreet_uniform_sign(scale_arg, range_arg); + }; +} + +template +auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; +} + #endif // GUARD_MIOPEN_TEST_RANDOM_HPP