diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d85f938b9..2f0cce18bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,11 @@ Full documentation for MIOpen is available [here](https://rocm.docs.amd.com/proj * [Conv] Enabled tuning through the `miopenSetConvolutionFindMode` API * [RNN] Added the new algorithm type `miopenRNNroundedDynamic` for LSTM * [TunaNet] Enabled NHWC for MI300 +* [BatchNorm] Enabled broad support for NHWC +* [BatchNorm] Enabled tuning through MIOPEN_FIND_ENFORCE ### Optimized * Updated KernelTuningNet for CK solvers +* NHWC Batchnorm ### Resolved issues diff --git a/CMakeLists.txt b/CMakeLists.txt index 0222db304a..3fd4b0775b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -128,7 +128,7 @@ if(MIOPEN_STRIP_SYMBOLS AND NOT WIN32 AND NOT APPLE) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -s") endif() -rocm_setup_version(VERSION 3.4.0) +rocm_setup_version(VERSION 3.4.1) list( APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ) include(TargetFlags) diff --git a/Jenkinsfile b/Jenkinsfile index 149035a575..37ace74aa6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -245,7 +245,11 @@ def buildHipClangJob(Map conf=[:]){ if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1" } - + def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3') + def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3') + dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} " + echo "Docker flags: ${dockerOpts}" + def variant = env.STAGE_NAME def needs_gpu = conf.get("needs_gpu", true) diff --git a/driver/bn_driver.hpp b/driver/bn_driver.hpp index bfa1b91aef..f7409d29b8 100644 --- a/driver/bn_driver.hpp +++ b/driver/bn_driver.hpp @@ -195,10 +195,8 @@ int BatchNormDriver::GetandSetData() SetBNParametersFromCmdLineArgs(); in.AllocOnHost(tensor{bn_layout, in_len}); - for(size_t i = 0; i < in.GetVector().size(); i++) - { - in.GetVector()[i] = 
prng::gen_canonical(); - } + // 0.0 to 2.0 + in.GetTensor().generate(uniform_unsigned_initializer(2e-3 /*scale*/, 1000 /*range*/)); auto derivedBnDesc = miopen::TensorDescriptor{}; miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode); @@ -208,20 +206,25 @@ int BatchNormDriver::GetandSetData() out.AllocOnHost(tensor{bn_layout, in_len}); scale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); bias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - - for(int i = 0; i < scale.GetVector().size(); i++) - { - scale.GetVector()[i] = prng::gen_canonical(); - bias.GetVector()[i] = prng::gen_canonical(); - } + // -2.0 to 2.0 + scale.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + bias.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } if(isFwdInfer) { estMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); estVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - auto gen_value_emean = [](auto...) 
{ return prng::gen_descreet_unsigned(1e-2, 100); }; - estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean); + // -2.0 to 2.0 + estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + // estVaraince has to be +ve number otherwise 1/sqrt(-ve) would + // give img number + estVariance.GetTensor().generate( + uniform_unsigned_initializer(2e-3 /*scale*/, 1000 /*range*/)); } else if(isFwdTrain) { @@ -230,11 +233,11 @@ int BatchNormDriver::GetandSetData() runMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); runVariance.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - for(int i = 0; i < runVariance.GetVector().size(); i++) - { - runMean.GetVector()[i] = prng::gen_canonical(); - runVariance.GetVector()[i] = prng::gen_canonical(); - } + // -2.0 to 2.0 + runMean.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + runVariance.GetTensor().generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } else if(isBwd) { @@ -242,33 +245,33 @@ int BatchNormDriver::GetandSetData() bnScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dy.AllocOnHost(tensor{bn_layout, in_len}); - - auto gen_var_bwd = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - - dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd); + // -2.0 to 2.0 + dy.InitHostData(dy.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); dScale.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); dBias.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedMean.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); savedInvVar.AllocOnHost(tensor{bn_layout, derivedBnDesc.GetLengths()}); - auto gen_value = [](auto...) 
{ return prng::gen_descreet_unsigned(1e-2, 100); }; - bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value); - - auto gen_in_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_in_var); - savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var); + bnScale.InitHostData( + bnScale.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + // -2.0 to 2.0 + savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), + true, + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } else { std::cout << "\nUnknown batch norm state!\n"; exit(EXIT_FAILURE); } - return miopenStatusSuccess; } @@ -297,7 +300,7 @@ int BatchNormDriver::AddCmdLineArgs() inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float"); inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float"); - inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int"); + inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int"); inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int"); inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int"); inflags.AddInputFlag("printconv", 'P', "1", "Print Convolution Dimensions (Default=1)", "int"); @@ -1364,11 +1367,6 @@ int BatchNormDriver::RunBackwardCPU() if(!back) return miopenStatusSuccess; - // T alphaDiff = 1, betaDiff = 0; - // T alphaParam = 1, betaParam = 0; - double alpha = static_cast(1), beta = static_cast(0), - gamma = static_cast(1); - // float alphaDataDiff = static_cast(1), betaDataDiff = static_cast(0); // float alphaParamDiff = static_cast(1), betaParamDiff = 
static_cast(0); int size{0}; @@ -1394,20 +1392,14 @@ int BatchNormDriver::RunBackwardCPU() if(bn_mode == miopenBNPerActivation) { // 1xCxHxW - batchNormActivSpatialHostBwdTrain(activ_mode, - gamma, - beta, - alpha, - in.GetTensor(), - dy.GetTensor(), - out.GetTensor(), - out_ref, - bnScale.GetTensor(), - dBias.GetTensor(), - dScale_ref, - dBias_ref, - savedMean.GetTensor(), - savedInvVar.GetTensor()); + batchNormPerActHostBwdTrain(in.GetTensor(), + dy.GetTensor(), + out_ref, + bnScale.GetTensor(), + dScale_ref, + dBias_ref, + savedMean.GetTensor(), + savedInvVar.GetTensor()); } else if(bn_mode == miopenBNSpatial) { // 1xCx1x1 diff --git a/fin b/fin index 344cf42f6c..25a7bffc6c 160000 --- a/fin +++ b/fin @@ -1 +1 @@ -Subproject commit 344cf42f6c18f309f3d1dd08af1cd7b73dd38e46 +Subproject commit 25a7bffc6c252bc5f5a4c05eba10f1c2930faa0f diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 78a4ef79ec..d69586adcb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -211,15 +211,13 @@ set( MIOpen_Source solver/batchnorm/backward_ck.cpp solver/batchnorm/backward_per_activation.cpp solver/batchnorm/backward_per_activation_fused.cpp - solver/batchnorm/backward_spatial_multiple.cpp - solver/batchnorm/backward_spatial_single.cpp + solver/batchnorm/backward_spatial.cpp solver/batchnorm/forward_inference.cpp solver/batchnorm/forward_inference_ck.cpp solver/batchnorm/forward_inference_fused.cpp solver/batchnorm/forward_per_activation.cpp solver/batchnorm/forward_per_activation_fused.cpp - solver/batchnorm/forward_spatial_multiple.cpp - solver/batchnorm/forward_spatial_single.cpp + solver/batchnorm/forward_spatial.cpp solver/batchnorm/forward_training_ck.cpp solver/cat/forward_cat.cpp solver/conv/conv_asm_1x1u.cpp diff --git a/src/batch_norm.cpp b/src/batch_norm.cpp index 2c5486f307..2d56ec2aa2 100644 --- a/src/batch_norm.cpp +++ b/src/batch_norm.cpp @@ -73,21 +73,10 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& if(layout == 
miopenTensorNCDHW) { layout = miopenTensorNCHW; - - // NxCxDxHxW -> NxCx(D*H)xW - dims[2] *= dims[3]; - dims[3] = dims[4]; - dims.pop_back(); } else if(layout == miopenTensorNDHWC) { layout = miopenTensorNHWC; - - // NxDxHxWxC -> Nx(D*H)xWxC - dims[1] *= dims[2]; - dims[2] = dims[3]; - dims[3] = dims[4]; - dims.pop_back(); } else { @@ -95,6 +84,12 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor& exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe) } + // Both NCDHW and NDHWC layout store the lens in NCHDW form + // hence : NxCxDxHxW -> NxCx(D*H)xW + dims[2] *= dims[3]; + dims[3] = dims[4]; + dims.pop_back(); + return {dataType, layout, dims}; } @@ -109,14 +104,11 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime) case 0: if(handle.IsProfilingEnabled()) { - *ctime = 0.; - handle.ResetKernelTime(); ktime = handle.GetKernelTime(); *ctime = ktime; #if(MIO_BN_CPP_PROF == 1) - printf("ktime0: %lf\n", ktime); - printf("ctime: %f\n", *ctime); + printf("kernel0: %7.3f ms total: %7.3f ms\n", ktime, *ctime); #endif } #if(MIOPEN_BN_SYNCH == 1) @@ -133,8 +125,7 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime) *ctime += ktime; #if(MIO_BN_CPP_PROF == 1) - printf("ktime1: %lf\n", ktime); - printf("ctime: %f\n", *ctime); + printf("kernel1: %7.3f ms total: %7.3f ms\n", ktime, *ctime); #endif } #if(MIOPEN_BN_SYNCH == 1) @@ -148,15 +139,12 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime) case 2: if(handle.IsProfilingEnabled()) { - -#if(MIO_BN_CPP_PROF == 1) ktime = handle.GetKernelTime(); + *ctime += ktime; + handle.ResetKernelTime(); handle.AccumKernelTime(*ctime); - printf("ktime2: %lf\n", ktime); - printf("ctime: %f\n", *ctime + ktime); -#else - handle.GetKernelTime(); - handle.AccumKernelTime(*ctime); +#if(MIO_BN_CPP_PROF == 1) + printf("kernel2: %7.3f ms total: %7.3f ms\n", ktime, *ctime); #endif } break; diff --git 
a/src/batchnorm/problem_description.cpp b/src/batchnorm/problem_description.cpp index daebf49208..2a5513bb24 100644 --- a/src/batchnorm/problem_description.cpp +++ b/src/batchnorm/problem_description.cpp @@ -30,8 +30,6 @@ #include #include -#define WORKAROUND_SWDEV_253606 1 - namespace miopen { namespace batchnorm { @@ -140,314 +138,49 @@ bool IsCKBwdTypeValid(const ProblemDescription& bn_problem) } NetworkConfig ProblemDescription::MakeNetworkConfig() const -{ - switch(direction) - { - case Direction::ForwardTraining: return MakeForwardTrainingNetworkConfig(); - case Direction::ForwardInference: return MakeForwardInferenceNetworkConfig(); - case Direction::Backward: return MakeBackwardNetworkConfig(); - default: MIOPEN_THROW(miopenStatusInternalError); - } -} - -NetworkConfig ProblemDescription::MakeForwardTrainingNetworkConfig() const { std::ostringstream ss; int n, c, h, w; std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - const unsigned int in_cstride = h * w; - const unsigned int in_nhw = n * in_cstride; - - size_t xlocalsize = 1024; - if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) - xlocalsize = 256; - - size_t ylocalsize = 1; - - size_t xgridsize = c * xlocalsize; - size_t ygridsize = 1; - - bool bfpmixparm = false; - if(IsMix()) + int d = 1; + // dimensions + ss << c; + ss << "x" << d << "x" << h << "x" << w; + ss << "x" << n; + // layout + ss << "x" << ComputeInLayout(); + ss << "x" << ComputeOutLayout(); + if(direction == Direction::Backward) { - bfpmixparm = true; + ss << "x" << ComputeDinLayout(); } - - if(bn_mode == miopenBNSpatial) + // data type + ss << "x" << GetDataTypeName(xDesc.GetType()); + ss << "x" << GetDataTypeName(yOrDyDesc.GetType()); + ss << "x" << GetDataTypeName(scaleDesc.GetType()); + ss << "x" << GetDataTypeName(biasDesc.GetType()); + ss << "x" << GetDataTypeName(sMeanDesc.GetType()); + ss << "x" << GetDataTypeName(sVarianceDesc.GetType()); + if(direction == Direction::Backward) { - bool single = 
true; - int variant = 1; - unsigned int ldsgcn = xlocalsize / 64; - -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) - { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - } - else -#endif - - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - const auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - single = false; - ldsgcn = ylocalsize / 64; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && IsFp32()) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - const auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - single = false; - ldsgcn = ylocalsize / 64; - } - - ss << "variant" << variant; - -#if(WORKAROUND_SWDEV_253606 == 0) - if(variant == 4) - { - ss << "rs" << static_cast(resultsave); - ss << "rr" << static_cast(resultrunning); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "c" << c; - } - else -#endif - { - ss << "gx" << xgridsize; - ss << "gy" << ygridsize; - ss << "xl" << xlocalsize; - ss << "yl" << ylocalsize; - ss << "ldsgcn" << ldsgcn; - ss << "rs" << static_cast(resultsave); - ss << "rr" << static_cast(resultrunning); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "single" << static_cast(single); - ss << "n" << n; - ss << "c" << c; - 
ss << "hw" << in_cstride; - } - } - else - { - xlocalsize = 1; - ylocalsize = 256; - const std::size_t segment = (in_cstride + ylocalsize - 1) / ylocalsize; - xgridsize = c; - ygridsize = segment * ylocalsize; - - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "gx" << xgridsize; - ss << "gy" << ygridsize; - ss << "lx" << xlocalsize; - ss << "ly" << ylocalsize; - ss << "rs" << static_cast(resultsave); - ss << "rr" << static_cast(resultrunning); - ss << "segment" << segment; - ss << "n" << n; - ss << "c" << c; - ss << "hw" << in_cstride; + ss << "x" << GetDataTypeName(dxDesc.GetType()); } - ss << "layout" << in_layout; - ss << "scaleType" << static_cast(IsScaleFp16()); - ss << "scaleType" << static_cast(IsScaleFp32()); - - return NetworkConfig{ss.str()}; -} + ss << "x" << IsMix(); -NetworkConfig ProblemDescription::MakeForwardInferenceNetworkConfig() const -{ - std::ostringstream ss; - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - const unsigned int in_cstride = h * w; - - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "mode" << bn_mode; - ss << "HWdims" << in_cstride; - ss << "C" << c; - ss << "layout" << in_layout; - ss << "scaleType" << static_cast(IsScaleFp16()); - ss << "scaleType" << static_cast(IsScaleFp32()); - - return NetworkConfig{ss.str()}; -} - -NetworkConfig ProblemDescription::MakeBackwardNetworkConfig() const -{ - std::ostringstream ss; - - bool bfpmixparm = false; - if(xDesc.GetType() == miopenHalf && GetBnScale().GetType() == miopenFloat) + // direction + ss << "x" << GetDirectionStr(); + // save and running + if(direction == Direction::ForwardTraining) { - bfpmixparm = true; + ss << "x" << 
resultsave; + ss << "x" << resultrunning; } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - const unsigned int in_cstride = h * w; - const unsigned int in_nhw = n * in_cstride; - - size_t xlocalsize = 1; - size_t ylocalsize = 1; - - size_t xgridsize = 1; - size_t ygridsize = 1; - - if(bn_mode == miopenBNSpatial) - { - unsigned int ldsgcn = 0; - bool single = true; - int variant = 1; - - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - else if(in_cstride <= 512) - { - if((n > 64) && (in_cstride > 160)) - { - variant = 3; - xlocalsize = - std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - else - { - variant = 0; - if(IsFp32()) - { - xlocalsize = 1024; - xgridsize = 1024 * static_cast(c); - } - else - { - xlocalsize = 256; - xgridsize = 256 * static_cast(c); - } - ldsgcn = xlocalsize / 64; - } - } - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - single = false; - ldsgcn = ylocalsize / 64; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - } - - ss << "variant" << variant; - ss << "gx" << xgridsize; - ss << "n" << n; - ss << "c" << c; - ss << "hw" << in_cstride; - ss << "gy" << ygridsize; - ss << "lx" << xlocalsize; - ss << "ly" << ylocalsize; - ss << "us" << static_cast(useSaved); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << 
static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "single" << static_cast(single); - ss << "gcn" << ldsgcn; - } - else + if(direction == Direction::Backward) { - ylocalsize = (64 >= in_cstride) ? 64 : 256; - const unsigned int segment = std::ceil(double(in_cstride) / double(ylocalsize)); - xgridsize = c; - ygridsize = segment * ylocalsize; - - ss << "gx" << xgridsize; - ss << "gy" << ygridsize; - ss << "lx" << xlocalsize; - ss << "ly" << ylocalsize; - ss << "n" << n; - ss << "c" << c; - ss << "hw" << in_cstride; - ss << "u" << static_cast(useSaved); - ss << "fp16" << static_cast(IsFp16()); - ss << "fp32" << static_cast(IsFp32()); - ss << "fp64" << static_cast(IsFp64()); - ss << "fbf16" << static_cast(IsBFp16()); - ss << "fmix" << static_cast(IsMix()); - ss << "nhw" << in_nhw; + ss << "x" << useSaved; } - ss << "layout" << in_layout; - ss << "scaleType" << static_cast(IsScaleFp16()); - ss << "scaleType" << static_cast(IsScaleFp32()); + ss << "x" << GetModeStr(); return NetworkConfig{ss.str()}; } diff --git a/src/driver_arguments.cpp b/src/driver_arguments.cpp index b730325add..6f079e01f9 100644 --- a/src/driver_arguments.cpp +++ b/src/driver_arguments.cpp @@ -156,11 +156,11 @@ void BnDriverInfo(std::stringstream& ss, } if((resultRunningMean != nullptr) && (resultRunningVariance != nullptr)) { - ss << " -s 1"; + ss << " -r 1"; } if((resultSaveMean != nullptr) && (resultSaveInvVariance != nullptr)) { - ss << " -r 1"; + ss << " -s 1"; } } diff --git a/src/fin/fin_interface.cpp b/src/fin/fin_interface.cpp index d321a687a0..f704461a89 100644 --- a/src/fin/fin_interface.cpp +++ b/src/fin/fin_interface.cpp @@ -354,6 +354,8 @@ AnySolver::AnyS switch(id) { + case 113: SetObject(); break; + case 117: SetObject(); break; case 142: SetObject(); break; case 143: SetObject(); break; case 144: SetObject(); break; diff --git a/src/include/miopen/batchnorm/common_spatial.hpp 
b/src/include/miopen/batchnorm/common_spatial.hpp new file mode 100644 index 0000000000..c95ef8b4f1 --- /dev/null +++ b/src/include/miopen/batchnorm/common_spatial.hpp @@ -0,0 +1,453 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + *******************************************************************************/ + +#pragma once + +#include + +#define WORKAROUND_SWDEV_253606 1 + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +inline void GetWGSizeNHWC(size_t c, + size_t h, + size_t w, + size_t min_workgroups, + bool bfp32parm, + size_t vectorsize, + size_t& xlocalsize, + size_t& ylocalsize) +{ + unsigned int xlocalsize_limit = vectorsize > 1 ? (bfp32parm ? 
16 : 32) : 64; + // shared memory size per workgroup is fixed + unsigned int max_localsize = 1024 / vectorsize; + + size_t nworkgroups = 0; + xlocalsize = 0; + // decrease max_localsize until the number of workgroups is greater than 80% + // of the available CUs + while(nworkgroups < min_workgroups && max_localsize >= xlocalsize_limit) + { + // xlocalsize must be power of 2 as reductions in the kernels rely on it, here c is rounded + // up to next power of 2. + xlocalsize = std::min(size_t{1 << int(std::ceil(std::log2(c / vectorsize)))}, + size_t{xlocalsize_limit}); + ylocalsize = max_localsize / xlocalsize; + nworkgroups = ((c / vectorsize + xlocalsize - 1) / xlocalsize) * + ((h * w + ylocalsize - 1) / ylocalsize); + max_localsize >>= 1; + } +} + +inline int GetStashMethod(bool IsLayoutNHWC, + miopenDataType_t problem_type, + unsigned int stash_values, + size_t c, + size_t n, + size_t in_cstride, + unsigned int ylocalsize) +{ + // See `batchnorm_functions.hpp` for stash implementation of different methods + int stash_method = 0; + stash_values *= (problem_type == miopenFloat ? 1 : 2); + unsigned int last_ylocalsize = + (in_cstride) % ylocalsize == 0 ? ylocalsize : (in_cstride) % ylocalsize; + if(last_ylocalsize < stash_values && n >= (size_t)stash_values) + { + stash_method = 1; + } + if(IsLayoutNHWC && !(problem_type == miopenFloat) && (c % 2 != 0) && (n >= stash_values)) + { + stash_method = 2; + } + return stash_method; +} + +// Returns true if spatial multiple is applicable and fill NHWC configuration +// (xlocalsize, ylocalsize). 
+// First workgroup size is computed given a problem and vectorsize, then it checks +// if the computed workgroup is applicable (spatial multiple restrictions) +inline bool GetLocalConfigNHWC(const miopen::batchnorm::ProblemDescription& problem, + unsigned int stash_values, + size_t vectorsize, + size_t& xlocalsize, + size_t& ylocalsize) +{ + bool bfp32parm = + problem.GetXDesc().GetType() == miopenHalf || problem.GetXDesc().GetType() == miopenBFloat16 + ? false + : true; + + size_t n, c, h, w = 0; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + assert((n != 0) && "n cannot be 0"); + assert((c != 0) && "c cannot be 0"); + assert((h != 0) && "h cannot be 0"); + assert((w != 0) && "w cannot be 0"); + + GetWGSizeNHWC( + c, h, w, problem.GetMinWorkgroups(), bfp32parm, vectorsize, xlocalsize, ylocalsize); + assert((xlocalsize != 0) && "xlocalsize cannot be 0"); + assert((ylocalsize != 0) && "ylocalsize cannot be 0"); + if(ylocalsize == 0) + { + ylocalsize = 1; + } + stash_values *= (bfp32parm ? 1 : 2); + unsigned int last_ylocalsize = (h * w) % ylocalsize == 0 ? ylocalsize : (h * w) % ylocalsize; + // FP32: + // - last block must have enough space to stash intermediate results in HW dimension + // - if last block doesn't fit, intermediate results are stored in N dimension which must + // be large enough + // Mix precision: + // - last block must have enough space to stash intermediate results in HW dimension + // - if last block doesn't fit, intermediate results are stored in N dimension which must + // be large enough + // - if C is not multiple of 2, intermediate results are stored in N dimension splitting + // float values in group of 2 bytes. 
N must be large enough + if((!bfp32parm && (c % 2 != 0 && n < (size_t)stash_values)) || + ((last_ylocalsize < stash_values) && (n < (size_t)stash_values))) + { + return false; + } + + return true; +} + +inline bool IsSpatialMultipleApplicable(const miopen::batchnorm::ProblemDescription& problem, + size_t vectorsize, + unsigned int stash_values) +{ + int n, c, h, w = 0; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + assert((n != 0) && "n cannot be 0"); + assert((c != 0) && "c cannot be 0"); + assert((h != 0) && "h cannot be 0"); + assert((w != 0) && "w cannot be 0"); + + unsigned int in_cstride = h * w; + + if(problem.IsLayoutNHWC()) + { + // check if the provided vectorsize can be used + if(c % vectorsize != 0) + { + return false; + } + // Variant 2 is the primary choice for NHWC + size_t xlocalsize, ylocalsize = 0; + + // The configuration is ignored at this point, it was just computed to check + // if spatial multiple could be applied. + return GetLocalConfigNHWC(problem, stash_values, vectorsize, xlocalsize, ylocalsize); + } + else + { + // check if the provided vectorsize can be used + if(in_cstride % vectorsize != 0) + { + return false; + } + + unsigned int ylocalsize = 1024; + unsigned int last_ylocalsize = + in_cstride % ylocalsize == 0 ? ylocalsize : in_cstride % ylocalsize; + // Restrictions: + // - last block must have enough space to stash intermediate results in HW dimension + // - if last block doesn't fit, intermediate results are stored in N dimension which must + // be large enough + stash_values *= (problem.GetXDesc().GetType() == miopenFloat ? 
1 : 2); + if(last_ylocalsize < stash_values && n < (size_t)stash_values) + { + return false; + } + } + return true; +} + +inline void GetSpatialMultipleConfig(const miopen::batchnorm::ProblemDescription& problem, + unsigned int stash_values, + size_t vectorsize, + size_t& xlocalsize, + size_t& ylocalsize, + size_t& xgridsize, + size_t& ygridsize, + int& stash_method) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + unsigned int in_cstride = h * w; + + if(problem.IsLayoutNHWC()) + { + // The function returns if the method is valid but we can ignore it + // at this point + GetLocalConfigNHWC(problem, stash_values, vectorsize, xlocalsize, ylocalsize); + + xgridsize = xlocalsize * ((c / vectorsize + xlocalsize - 1) / xlocalsize); + ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize); + } + else + { + xlocalsize = 1; + xgridsize = c; + ylocalsize = 1024; + if(ylocalsize > in_cstride / vectorsize) + { + // No need to use workgroups larger than the HW dimension + ylocalsize = std::max(size_t{64}, + size_t{1 << int(std::ceil(std::log2(in_cstride / vectorsize)))}); + } + ygridsize = ylocalsize * ((in_cstride / vectorsize + ylocalsize - 1) / ylocalsize); + } + stash_method = GetStashMethod(problem.IsLayoutNHWC(), + problem.GetXDesc().GetType(), + stash_values, + c, + n, + in_cstride, + ylocalsize); +} + +inline void GetVariantFromKernelId(const std::string& kernel_id, int& variant, size_t& vectorsize) +{ + // kernel_id has the following standard: + // Variant- + size_t pos = kernel_id.find("Variant"); + if(pos != std::string::npos) + { + variant = kernel_id[pos + 7] - '0'; + vectorsize = kernel_id[pos + 9] - '0'; + } +} + +inline std::string GetKernelIdFromVariant(int variant, size_t vectorsize) +{ + std::stringstream stream; + stream << "Variant" << variant << "-" << vectorsize; + return stream.str(); +} + +inline bool UseMultiple(const miopen::batchnorm::ProblemDescription& problem) +{ + size_t n, c, h, w; + 
std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + bool bfpmixparm = (problem.GetXDesc().GetType() == miopenHalf || + problem.GetXDesc().GetType() == miopenBFloat16) && + problem.GetBnScale().GetType() == miopenFloat + ? true + : false; + + unsigned int in_cstride = h * w; + unsigned int in_nhw = n * in_cstride; + // Check heuristics (used to choose between spatial single and multiple for performance) + // TODO: review these conditions (variant 2 was optimized and vectorization was added, + // so we need a set of benchmarks to check that these conditions are still correct) + if(!problem.IsLayoutNHWC() && + problem.GetDirection() == miopen::batchnorm::Direction::Backward && + (!((in_nhw >= static_cast(32 * 1024 * 1024) || in_cstride <= 1024) && + (in_nhw >= static_cast(32 * 1024 * 1024) || in_cstride <= 512) && + in_cstride > 512))) + { + return false; + } + + if(!problem.IsLayoutNHWC() && + problem.GetDirection() == miopen::batchnorm::Direction::ForwardTraining && + (!((n >= 3 && in_cstride > 512 && (in_nhw >= 33554432 || in_cstride <= 1024) && + ((n < 256) || (in_cstride <= 60) || !bfpmixparm) && + (!bfpmixparm || in_cstride <= 512)) || + ((n > 768) && (in_cstride > 150))))) + { + return false; + } + + return true; +} + +inline void DefaultConfigSpatialSingle(const miopen::batchnorm::ProblemDescription& problem, + std::vector& valid_kernels) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nhw = n * in_cstride; + + bool bfpmixparm = + problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenFloat + ? true + : false; + + bool bbfpmixparam = problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat + ? true + : false; + + // NCHW supports also variants 0 and 3 which can be much faster than + // variant 1 but have more restrictions. 
Here we decide if we use variant + // 0, 1, 3 + // In case variant 0 or 3 are selected, we add also variant 1 for tuning. + // Almost always variant 0 and 3 will be faster than variant 1 but + // we add the latter for tuning to be sure and because it is cheap + if(!problem.IsLayoutNHWC()) + { + if(problem.GetDirection() == miopen::batchnorm::Direction::Backward) + { + if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + + // N*H*W < 32M and H*W > 1024 + // use batchnorm variant#1 implementation which parallelize + // work groups over channels and loop through NHW. + if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + // N*H*W < 32M and H*W > 512 + // use batchnorm variant#1 or variant#3 implementation which + // parallelize work groups over channels and loop through N. + else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) + { + if(n >= 32) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + else + { + valid_kernels.push_back(GetKernelIdFromVariant(3, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + } + // H*W < 512 use batchnorm variant#0 or variant#3 implementation + // based on batch size and H*W + else if(in_cstride <= 512) + { + if((n > 64) && (in_cstride > 160)) + { + valid_kernels.push_back(GetKernelIdFromVariant(3, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + else + { + valid_kernels.push_back(GetKernelIdFromVariant(0, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + } + } + else + { +#if(WORKAROUND_SWDEV_253606 == 0) + if(n < 3) + { + valid_kernels.push_back(GetKernelIdFromVariant(4, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + else +#endif + { + // clang-format off + if(in_cstride > 512 && in_cstride <= 1024 && n < 32) + { + 
valid_kernels.push_back(GetKernelIdFromVariant(3, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + + if( (in_nhw < 33554432 && in_cstride > 1024) || + ((n >= 256) && (in_cstride > 60) && (bfpmixparm || bbfpmixparam)) || + ((in_cstride > 512) && (bfpmixparm || bbfpmixparam))) + { + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + if(in_cstride <= 512) + { + valid_kernels.push_back(GetKernelIdFromVariant(0, 1)); + } + return; + } + else if(in_cstride <= 512) + { + valid_kernels.push_back(GetKernelIdFromVariant(0, 1)); + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + return; + } + // clang-format on + } + } + valid_kernels.push_back(GetKernelIdFromVariant(1, 1)); + } +} + +inline void DefaultConfigSpatialMultiple(const miopen::batchnorm::ProblemDescription& problem, + unsigned int stash_values, + std::vector& valid_kernels) +{ + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + unsigned int in_cstride = h * w; + + // Default configuration for spatial multiple tries to use vectorization + // for both NCHW or NHWC + size_t vectorsize = + problem.IsLayoutNHWC() ? (c % 4 == 0 ? 4 : 1) : (in_cstride % 4 == 0 ? 
4 : 1); + if(IsSpatialMultipleApplicable(problem, vectorsize, stash_values)) + { + valid_kernels.push_back(GetKernelIdFromVariant(2, vectorsize)); + // if vectorized version is applicable, then the non vectorized version + // is also added to the list of configurations + if(vectorsize > 1) + { + valid_kernels.push_back(GetKernelIdFromVariant(2, 1)); + } + return; + } + + // If spatial multiple with vectorization can not be used, try without vectorization + if(vectorsize > 1 && IsSpatialMultipleApplicable(problem, 1, stash_values)) + { + valid_kernels.push_back(GetKernelIdFromVariant(2, 1)); + } +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/include/miopen/batchnorm/problem_description.hpp b/src/include/miopen/batchnorm/problem_description.hpp index 4fed9033a6..50d026ea71 100644 --- a/src/include/miopen/batchnorm/problem_description.hpp +++ b/src/include/miopen/batchnorm/problem_description.hpp @@ -73,7 +73,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, double expAvgFactor_, double epsilon_, bool resultsave_, - bool resultrunning_) + bool resultrunning_, + size_t min_workgroups_) : direction(Direction::ForwardTraining), bn_mode(bn_mode_), xDesc(xDesc_), @@ -85,7 +86,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, expAvgFactor(expAvgFactor_), epsilon(epsilon_), resultsave(resultsave_), - resultrunning(resultrunning_) + resultrunning(resultrunning_), + min_workgroups(min_workgroups_) { SetSpatialDims(); in_layout = ComputeInLayout(); @@ -126,7 +128,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, const TensorDescriptor& sMeanDesc_, const TensorDescriptor& sVarianceDesc_, double epsilon_, - bool useSaved_) + bool useSaved_, + size_t min_workgroups_) : direction(Direction::Backward), bn_mode(bn_mode_), xDesc(xDesc_), @@ -137,7 +140,8 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, 
sMeanDesc(sMeanDesc_), sVarianceDesc(sVarianceDesc_), epsilon(epsilon_), - useSaved(useSaved_) + useSaved(useSaved_), + min_workgroups(min_workgroups_) { SetSpatialDims(); in_layout = ComputeInLayout(); @@ -196,6 +200,12 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, return resultrunning; } + std::size_t GetMinWorkgroups() const + { + assert(direction == Direction::ForwardTraining || direction == Direction::Backward); + return min_workgroups; + } + bool UseSaved() const { assert(direction == Direction::Backward); @@ -307,17 +317,14 @@ struct MIOPEN_INTERNALS_EXPORT ProblemDescription : ProblemDescriptionBase, #pragma clang diagnostic pop #endif - bool resultsave = false; - bool resultrunning = false; - bool useSaved = false; - std::string in_layout = "NCHW"; - std::string out_layout = "NCHW"; - std::string din_layout = "NCHW"; - std::size_t spatial_dim = 2; - - NetworkConfig MakeForwardTrainingNetworkConfig() const; - NetworkConfig MakeForwardInferenceNetworkConfig() const; - NetworkConfig MakeBackwardNetworkConfig() const; + bool resultsave = false; + bool resultrunning = false; + bool useSaved = false; + std::string in_layout = "NCHW"; + std::string out_layout = "NCHW"; + std::string din_layout = "NCHW"; + std::size_t spatial_dim = 2; + std::size_t min_workgroups = 1; std::string ComputeLayout(const TensorDescriptor& td) const { return td.GetLayout_str(); } std::string ComputeInLayout() const { return ComputeLayout(xDesc); } diff --git a/src/include/miopen/batchnorm/solvers.hpp b/src/include/miopen/batchnorm/solvers.hpp index 7b8696b95d..13a9a52387 100644 --- a/src/include/miopen/batchnorm/solvers.hpp +++ b/src/include/miopen/batchnorm/solvers.hpp @@ -49,11 +49,11 @@ using BatchNormTunableSolver = TunableSolverMixin; ; -struct BnFwdTrainingSpatialSingle final : BatchnormSolver +struct BnFwdTrainingPerActivation final : BatchnormSolver { const std::string& SolverDbId() const override { - return GetSolverDbId(); + return 
GetSolverDbId(); } bool IsApplicable(const ExecutionContext& context, @@ -62,56 +62,106 @@ struct BnFwdTrainingSpatialSingle final : BatchnormSolver const miopen::batchnorm::ProblemDescription& problem) const override; }; -struct BnFwdTrainingSpatialMultiple final : BatchnormSolver +struct PerformanceConfigBnBwdBackward : PerfConfigBase { - const std::string& SolverDbId() const override + int index; + std::string kernel_id; + std::vector valid_kernels; + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnBwdBackward(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id) { - return GetSolverDbId(); } + PerformanceConfigBnBwdBackward() : PerformanceConfigBnBwdBackward(0, "") {} + PerformanceConfigBnBwdBackward(bool) : PerformanceConfigBnBwdBackward(0, "") {} + MIOPEN_INTERNALS_EXPORT void + HeuristicInit(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; + MIOPEN_INTERNALS_EXPORT bool + IsValid(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const; - bool IsApplicable(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigBnBwdBackward& other) const; }; -struct BnFwdTrainingPerActivation final : BatchnormSolver +struct PerformanceConfigBnFwdTraining : PerfConfigBase { - const std::string& SolverDbId() const override + int index; + std::string kernel_id; + std::vector valid_kernels; + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnFwdTraining(int idx, std::string kernl_id) + : index(idx), kernel_id(kernl_id) { - return GetSolverDbId(); } + 
PerformanceConfigBnFwdTraining() : PerformanceConfigBnFwdTraining(0, "") {} + PerformanceConfigBnFwdTraining(bool) : PerformanceConfigBnFwdTraining(0, "") {} + MIOPEN_INTERNALS_EXPORT void + HeuristicInit(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool SetNextValue(const miopen::batchnorm::ProblemDescription& problem); + MIOPEN_INTERNALS_EXPORT bool IsValidValue() const; + MIOPEN_INTERNALS_EXPORT bool + IsValid(const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const; - bool IsApplicable(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; - ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + template + static void Visit(Self&& s, F f) + { + f(s.kernel_id, "kernel_id"); + } + MIOPEN_INTERNALS_EXPORT bool operator==(const PerformanceConfigBnFwdTraining& other) const; }; -struct BnBwdTrainingSpatialSingle final : BatchnormSolver +struct BnBwdTrainingSpatial final : BatchNormTunableSolver { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnBwdBackward GetDefaultPerformanceConfig( + const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT bool + IsValidPerformanceConfig(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnBwdBackward& config) const override; + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnBwdBackward + Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const override; bool IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const override; 
ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnBwdBackward& config) const override; }; -struct BnBwdTrainingSpatialMultiple final : BatchnormSolver +struct BnFwdTrainingSpatial final : BatchNormTunableSolver { - const std::string& SolverDbId() const override - { - return GetSolverDbId(); - } + const std::string& SolverDbId() const override { return GetSolverDbId(); } + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnFwdTraining GetDefaultPerformanceConfig( + const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const override; + + MIOPEN_INTERNALS_EXPORT bool + IsValidPerformanceConfig(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnFwdTraining& config) const override; + + MIOPEN_INTERNALS_EXPORT PerformanceConfigBnFwdTraining + Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const override; bool IsApplicable(const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const override; ConvSolution GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const override; + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnFwdTraining& config) const override; }; struct BnBwdTrainingPerActivation final : BatchnormSolver diff --git a/src/include/miopen/fusion/problem_description.hpp b/src/include/miopen/fusion/problem_description.hpp index b3d1669cee..fc485300ce 100644 --- a/src/include/miopen/fusion/problem_description.hpp +++ b/src/include/miopen/fusion/problem_description.hpp @@ -153,7 +153,8 @@ struct FusionDescription : ProblemDescriptionBase not_used, // expAvgFactor filler not_used, true /* resultSave*/, - 
bn_op.runningMeanVar}; + bn_op.runningMeanVar, + 1}; /* min number of workgroups */ } else if(dir == miopen::batchnorm::Direction::Backward) { @@ -170,7 +171,8 @@ struct FusionDescription : ProblemDescriptionBase {} /*bn_op.base_desc*/, {} /*bn_op.base_desc*/, not_used, - bn_op.useBatchStats /*useSaved*/}; + bn_op.useBatchStats /*useSaved*/, + 1}; /*min number of workgroups */ } else MIOPEN_THROW(miopenStatusNotImplemented); diff --git a/src/include/miopen/solver/implicitgemm_ck_util.hpp b/src/include/miopen/solver/implicitgemm_ck_util.hpp index 27800f15b0..91d055571b 100644 --- a/src/include/miopen/solver/implicitgemm_ck_util.hpp +++ b/src/include/miopen/solver/implicitgemm_ck_util.hpp @@ -312,12 +312,13 @@ ConvSolution InitAnyInvokerFactory(const ProblemDescriptionType& problem, const auto& data_ctx = primitive_parameters.CastTo(); auto argument_ptr = ck_args.MakeArgPtr(sh_conv_ptr, data_ctx); auto invoker_ptr = sh_conv_ptr->MakeInvokerPointer(); - - const auto enable_profiling = handle.IsProfilingEnabled(); - float elapsed_time = - invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), enable_profiling}); - if(enable_profiling) { + WorkAroundHipEventProfiler prf(handle); + invoker_ptr->Run(argument_ptr.get(), {handle.GetStream(), false}); + } + if(handle.IsProfilingEnabled()) + { + float elapsed_time = handle.GetKernelTime(); handle.ResetKernelTime(); handle.AccumKernelTime(elapsed_time); } diff --git a/src/kernels/MIOpenBatchNormBwdSpatial.cl b/src/kernels/MIOpenBatchNormBwdSpatial.cl index a018d35640..7465ff547f 100644 --- a/src/kernels/MIOpenBatchNormBwdSpatial.cl +++ b/src/kernels/MIOpenBatchNormBwdSpatial.cl @@ -638,300 +638,451 @@ MIOpenBatchNormBwdSpatial(const __global _FLOAT* __restrict x_in, __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void MIOpenBatchNormBwdSpatialFinalMeanVariance(__global _FLOAT* __restrict meanvarbuff, - _FLOAT INHW, + _FLOAT_PREC INHW, double epsilon) { - _FLOAT variance = (_FLOAT)0.; - 
_FLOAT invVariance = (_FLOAT)0.; - _FLOAT mean = (_FLOAT)0.; - unsigned int lid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); - unsigned int xgid = get_global_id(0); - unsigned int ygrp_sz = get_local_size(1); - unsigned int yngrps = get_num_groups(1); - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - unsigned int commitID = 0; - - for(int gn = 0; gn < yngrps; gn++) + + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int xgid = get_global_id(0); + unsigned int xgrp_sz = get_local_size(0); + unsigned int ygrp_sz = get_local_size(1); + + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C invVariance; + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) { - unsigned int offset = gn * ygrp_sz + lid; - unsigned int meanindex = cidx + ygrp_sz * offset; - unsigned int varindex = cidx + ygrp_sz * offset + 2; - if(offset < yngrps) - { // modify to span larger number of groups - mean += *(meanvarbuff + meanindex); - variance += *(meanvarbuff + varindex); // load per group variance - } + mean += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + variance += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDS_SIZE]; - lds_reduce2(&mean, &variance, FLOAT2ACCUM(INHW), lcl_data_x, lcl_data_y, lid); +#if 
!MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + // TODO: this simple approach has many bank conflicts, optimize if it affects performance + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + INHW, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - commitID = 64; - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&mean, &variance, FLOAT2ACCUM(INHW), lcl_data_x, lcl_data_y, lid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&mean, &variance, INHW, lcl_data_x, lcl_data_y, ylid); #endif - barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); - variance = mad(-mean, mean, variance); - if(variance < 0) - { - variance = 0; - } - invVariance = rsqrt(variance + epsilon); - if(lid == commitID) + variance = mad(-mean, mean, variance); + variance = max(variance, (_FLOAT_PREC_C)0.); + invVariance = rsqrt(variance + (_FLOAT_PREC_C)epsilon); + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) { - meanvarbuff[meanstashindex] = mean; // stash mean - meanvarbuff[varstashindex] = invVariance; // stash mean + // Replicate mean and variance for all y groups because stash == dx_out and + // MIOpenBatchNormBwdSpatialDX will read them and rewrite the buffer entirely. 
+ storeToStash(mean, + (__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(invVariance, + (__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void MIOpenBatchNormBwdSpatialMeanVariance(const __global _FLOAT* __restrict in, - __global _FLOAT* __restrict mvbuff) + __global _FLOAT* __restrict meanvarbuff) { + unsigned int xlid = get_local_id(0); unsigned int ylid = get_local_id(1); + unsigned int xgrp_id = get_group_id(0); unsigned int ygrp_id = get_group_id(1); unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); + + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanindex = cidx + ygrp_sz * ygrp_id; - unsigned int varindex = meanindex + 2; - _FLOAT mean = (_FLOAT)0.; - _FLOAT variance = (_FLOAT)0.; - _FLOAT value = (_FLOAT)0.; - - if(ygid < MIO_BN_HW) - { + _FLOAT_PREC_LS value; + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + if(ygid * VEC_SIZE_Y < MIO_BN_HW) + { + _FLOAT_LS read4; for(unsigned int n = 0; n < MIO_BN_N; n++) { - index = n * MIO_BN_CHW + cidx + ygid; - value = *(in + index); - mean += value; - variance = mad(value, value, variance); + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const __global _FLOAT_LS*)(in + index)); + value = FLOAT2FLOATPREC_VEC(read4); + _ACCUMULATE(mean, value) + _ACCUMULATE_MAD(variance, value, value, variance) } } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_NGRPS]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_NGRPS]; - lds_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; gcn_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif if(ylid == 0) { - mvbuff[meanindex] = mean; - mvbuff[varindex] = variance; + storeToStash(mean, + (__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + 
storeToStash(variance, + (__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } } // end spatial mean kernel #endif // end USESAVED == 0 __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormBwdSpatialDScaleDBias(const __global _FLOAT* x_in, - const __global _FLOAT* dy_in, - __global _FLOAT* buff -#if(MIO_BN_USESAVED == 1) - +MIOpenBatchNormBwdSpatialDScaleDBias(const __global _FLOAT* __restrict x_in, + const __global _FLOAT* __restrict dy_in, + __global _FLOAT* __restrict buff +#if MIO_BN_USESAVED == 1 , - const __global _FLOAT* savedMean, - const __global _FLOAT* savedInvVariance + const __global _FLOAT_PREC* __restrict savedMean, + const __global _FLOAT_PREC* __restrict savedInvVariance #endif ) { - unsigned int xgid = get_global_id(0); + unsigned int xlid = get_local_id(0); unsigned int ylid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); + unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int ygrp_id = get_group_id(1); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); - unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - _FLOAT mean = (_FLOAT)0.; - _FLOAT invVar = (_FLOAT)0.; - _FLOAT elemStd = (_FLOAT)0.; - _FLOAT xhat = (_FLOAT)0.; - _FLOAT dscale = (_FLOAT)0.; - _FLOAT dbias = (_FLOAT)0.; + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; - __local _FLOAT lmean, livar; + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + unsigned int index; + _FLOAT_PREC_C mean, invVar; + _FLOAT_PREC_LS elemStd, xhat; + _FLOAT_PREC_C dscale = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C dbias = (_FLOAT_PREC_C)0.; + + local _FLOAT_PREC_C lmean[MIO_BN_GRP0], livar[MIO_BN_GRP0]; if(ylid == 0) { -#if(MIO_BN_USESAVED == 0) - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - lmean = *(buff + meanstashindex); // load stashed mean - livar = *(buff + varstashindex); -#else // NO SAVED - lmean = *(savedMean + xgid); - livar = *(savedInvVariance + xgid); -#endif // SAVED +#if MIO_BN_USESAVED == 0 + lmean[xlid] = loadFromStash((__global _FLOAT_C*)buff, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + livar[xlid] = loadFromStash((__global _FLOAT_C*)buff, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); +#else + lmean[xlid] = *((__global _FLOAT_PREC_C*)(savedMean + xgid * VEC_SIZE_X)); + livar[xlid] = *((__global _FLOAT_PREC_C*)(savedInvVariance + xgid * VEC_SIZE_X)); +#endif } barrier(CLK_LOCAL_MEM_FENCE); - if(ygid < MIO_BN_HW) + if(ygid * VEC_SIZE_Y < MIO_BN_HW) { - mean = lmean; - invVar = livar; + mean = lmean[xlid]; + invVar = livar[xlid]; + _FLOAT_LS read4; + _FLOAT_PREC_LS value1, value2; for(unsigned int n = 0; n < MIO_BN_N; n++) { - index = n * MIO_BN_CHW + cidx + ygid; - dbias += *(dy_in + index); - elemStd = *(x_in + index) - mean; + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const __global _FLOAT_LS*)(dy_in + index)); + value1 = FLOAT2FLOATPREC_VEC(read4); + _ACCUMULATE(dbias, value1) + read4 = *((const __global _FLOAT_LS*)(x_in + index)); + value2 = FLOAT2FLOATPREC_VEC(read4); + elemStd = value2 - mean; xhat = elemStd * invVar; - dscale = mad(xhat, dy_in[index], dscale); + 
_ACCUMULATE_MAD(dscale, xhat, value1, dscale) } } -// REDUCE over DS and DB -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y2[MIO_BN_LDS_SIZE]; - lds_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, ylid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&dscale, + &dbias, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y2[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, ylid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif - // end reduction----------- if(ylid == 0) { - unsigned int betaindex = cidx + ygrp_sz * ygrp_id + 6; - unsigned int gammaindex = cidx + ygrp_sz * ygrp_id + 4; - buff[gammaindex] = FLOAT2FLOATPREC(dscale); - buff[betaindex] = FLOAT2FLOATPREC(dbias); + storeToStash(dscale, + (__global _FLOAT_C*)buff, + 2, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(dbias, + (__global _FLOAT_C*)buff, + 3, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormBwdSpatialFinalDScaleDBias(__global _FLOAT* buff, - __global _FLOAT* delta_scale, - __global _FLOAT* delta_bias) +MIOpenBatchNormBwdSpatialFinalDScaleDBias(const __global _FLOAT* __restrict buff, + __global _FLOAT_PREC* __restrict delta_scale, + __global _FLOAT_PREC* __restrict delta_bias) { - _FLOAT ds = (_FLOAT)0.; - _FLOAT db = 
(_FLOAT)0.; - - unsigned int lid = get_local_id(1); + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); unsigned int xgid = get_global_id(0); - unsigned int ygid = get_global_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); - unsigned int yngrps = get_num_groups(1); - int cidx = MIO_BN_HW * xgid; - for(int gn = 0; gn < MIO_BN_NGRPS; gn++) + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + _FLOAT_PREC_C dscale = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C dbias = (_FLOAT_PREC_C)0.; + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) { - unsigned int offset = gn * ygrp_sz + lid; - if(offset < yngrps) - { // modify to span larger number of groups - unsigned int gammaindex = cidx + ygrp_sz * offset + 4; - unsigned int betaindex = cidx + ygrp_sz * offset + 6; - ds += *(buff + gammaindex); - db += *(buff + betaindex); - } + dscale += loadFromStash((__global _FLOAT_C*)buff, + 2, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + dbias += loadFromStash((__global _FLOAT_C*)buff, + 3, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_NGRPS]; - local _FLOAT_ACCUM lcl_data_y2[MIO_BN_NGRPS]; - lds_reduce2(&ds, &db, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, lid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&dscale, + &dbias, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x2[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM 
lcl_data_y2[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&ds, &db, (_FLOAT_ACCUM)1.0, lcl_data_x2, lcl_data_y2, lid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&dscale, &dbias, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif - if(ygid == 0) + if(ylid == 0) { - delta_scale[xgid] = FLOAT2FLOATPREC(ds); - delta_bias[xgid] = FLOAT2FLOATPREC(db); + *((__global _FLOAT_PREC_C*)(delta_scale + xgid * VEC_SIZE_X)) = dscale; + *((__global _FLOAT_PREC_C*)(delta_bias + xgid * VEC_SIZE_X)) = dbias; } } __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormBwdSpatialDX(const __global _FLOAT* x_in, - const __global _FLOAT* dy_in, - __global _FLOAT* dx_out, - const __global _FLOAT* bnScale, - __global _FLOAT* delta_scale, - __global _FLOAT* delta_bias, -#if(MIO_BN_USESAVED == 1) - const __global _FLOAT* savedMean, - const __global _FLOAT* savedInvVariance, +MIOpenBatchNormBwdSpatialDX(const __global _FLOAT* __restrict x_in, + const __global _FLOAT* __restrict dy_in, + __global _FLOAT* __restrict dx_out, + const __global _FLOAT_PREC* __restrict bnScale, + const __global _FLOAT_PREC* __restrict delta_scale, + const __global _FLOAT_PREC* __restrict delta_bias, +#if MIO_BN_USESAVED == 1 + const __global _FLOAT_PREC* __restrict savedMean, + const __global _FLOAT_PREC* __restrict savedInvVariance, #endif - _FLOAT INHW) + _FLOAT_PREC INHW) { - int xgid = get_global_id(0); - int ygid = get_global_id(1); - int cidx = MIO_BN_HW * xgid; + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); + + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + unsigned int index; - _FLOAT mean, invVar; - _FLOAT elemStd, xhat; - _FLOAT scale, dscale, dbias; - _FLOAT tmp1, tmp2, tmp3; - _FLOAT NHW = (_FLOAT)MIO_BN_NHW; + _FLOAT_PREC_C mean, invVar; + _FLOAT_PREC_LS elemStd, xhat; + _FLOAT_PREC_C scale, dscale, dbias; + _FLOAT_PREC_LS tmp1, tmp2, tmp3, tmp4; + _FLOAT_PREC_LS value1, value2; + _FLOAT_LS read4; + _FLOAT_PREC NHW = (_FLOAT_PREC)MIO_BN_NHW; - local _FLOAT lscale, ldscale, ldbias, lmean, livar; + local _FLOAT_PREC_C lscale[MIO_BN_GRP0], ldscale[MIO_BN_GRP0], ldbias[MIO_BN_GRP0], + lmean[MIO_BN_GRP0], livar[MIO_BN_GRP0]; - if(get_local_id(1) == 0) + if(ylid == 0) { - -#if(MIO_BN_USESAVED == 0) - int ygrp_id = get_group_id(1); - int ygrp_sz = get_local_size(1); - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - lmean = *(dx_out + meanstashindex); // load stashed mean - livar = *(dx_out + varstashindex); -#else // SAVED - lmean = *(savedMean + xgid); - livar = *(savedInvVariance + xgid); -#endif // SAVED - lscale = *(bnScale + xgid); - ldscale = *(delta_scale + xgid); - ldbias = *(delta_bias + xgid); +#if MIO_BN_USESAVED == 0 + unsigned int xgrp_id = get_group_id(0); + unsigned int ygrp_id = get_group_id(1); + unsigned int xgrp_sz = get_local_size(0); + unsigned int ygrp_sz = get_local_size(1); + + lmean[xlid] = loadFromStash((__global _FLOAT_C*)dx_out, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + livar[xlid] = loadFromStash((__global _FLOAT_C*)dx_out, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); +#else + lmean[xlid] = *((const __global _FLOAT_PREC_C*)(savedMean + xgid * VEC_SIZE_X)); + livar[xlid] = *((const __global _FLOAT_PREC_C*)(savedInvVariance + xgid * VEC_SIZE_X)); +#endif + lscale[xlid] = *((const __global _FLOAT_PREC_C*)(bnScale + xgid * 
VEC_SIZE_X)); + ldscale[xlid] = *((const __global _FLOAT_PREC_C*)(delta_scale + xgid * VEC_SIZE_X)); + ldbias[xlid] = *((const __global _FLOAT_PREC_C*)(delta_bias + xgid * VEC_SIZE_X)); } barrier(CLK_LOCAL_MEM_FENCE); - //________________________________________________ - // Group level reduction - // Need to reduce over all elements in NxHxW - // move across the sections of an image in the mini_batch stack - if(ygid < MIO_BN_HW) - { - mean = lmean; - invVar = livar; - scale = lscale; - dscale = ldscale; - dbias = ldbias; + if(ygid * VEC_SIZE_Y < MIO_BN_HW) + { + mean = lmean[xlid]; + invVar = livar[xlid]; + scale = lscale[xlid]; + dscale = ldscale[xlid]; + dbias = ldbias[xlid]; for(unsigned int n = 0; n < MIO_BN_N; n++) { // apply normalization - index = n * MIO_BN_CHW + cidx + ygid; - elemStd = *(x_in + index) - mean; // (x_i - mean) - xhat = elemStd * invVar; // recalculating this again... - tmp1 = mad(NHW, *(dy_in + index), -dbias); - tmp2 = -xhat * dscale; - tmp3 = scale * invVar * INHW; - dx_out[index] = tmp3 * (tmp2 + tmp1); + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const __global _FLOAT_LS*)(x_in + index)); + value1 = FLOAT2FLOATPREC_VEC(read4); + elemStd = value1 - mean; // (x_i - mean) + xhat = elemStd * invVar; // recalculating this again... + read4 = *((const __global _FLOAT_LS*)(dy_in + index)); + value2 = FLOAT2FLOATPREC_VEC(read4); + tmp1 = mad((_FLOAT_PREC_LS)NHW, value2, -dbias); + tmp2 = -xhat * dscale; + tmp3 = scale * invVar * INHW; + tmp4 = tmp3 * (tmp2 + tmp1); + *((__global _FLOAT_LS*)(dx_out + index)) = FLOATPREC2FLOAT_VEC(tmp4); } } } diff --git a/src/kernels/MIOpenBatchNormFwdInferPerAct.cl b/src/kernels/MIOpenBatchNormFwdInferPerAct.cl index f516a076ea..df2d66582d 100644 --- a/src/kernels/MIOpenBatchNormFwdInferPerAct.cl +++ b/src/kernels/MIOpenBatchNormFwdInferPerAct.cl @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. 
+ * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -35,43 +35,52 @@ #include "batchnorm_functions.h" __attribute__((reqd_work_group_size(MIO_BN_GRP0, MIO_BN_GRP1, MIO_BN_GRP2))) __kernel void -MIOpenBatchNormFwdInferPerActivationEst(const __global _FLOAT* in, - __global _FLOAT* __restrict out, - __global _FLOAT_PREC* __restrict estimatedMean, - __global _FLOAT_PREC* __restrict estimatedVariance, +MIOpenBatchNormFwdInferPerActivationEst(const __global _FLOAT* __restrict in, /* x input */ + __global _FLOAT* __restrict out, /* y output */ + const __global _FLOAT_PREC* __restrict estimatedMean, + const __global _FLOAT_PREC* __restrict estimatedVariance, const __global _FLOAT_PREC* __restrict scale, const __global _FLOAT_PREC* __restrict bias, double epsilon, + unsigned int c, + unsigned int hw, unsigned int batchSize, - unsigned int imageDims, + unsigned int cStride, + unsigned int hwStride, unsigned int batchStride) { + int xgid = get_global_id(0); + int ygid = get_global_id(1); + + if(xgid * VEC_SIZE_X >= c || ygid * VEC_SIZE_Y >= hw) + return; - // PER ACTIVATION - _FLOAT_PREC mean, variance; - _FLOAT_PREC invVariance, elemStd, inhat; - _FLOAT_PREC pvt_scale, pvt_bias; unsigned int adjIndex, index; - int ygid = get_global_id(1); - int yglb_sz = get_global_size(1); - int grpid = get_group_id(0); - for(int img_offset = ygid; img_offset < imageDims; img_offset += yglb_sz) + // PER ACTIVATION + _FLOAT_PREC_LS mean, variance, invVariance; + _FLOAT_PREC_LS inhat; + _FLOAT_PREC_LS pscale, pbias; + _FLOAT_LS value; + + adjIndex = (xgid * cStride * VEC_SIZE_X) + (ygid * hwStride * VEC_SIZE_Y); + mean = *((const __global _FLOAT_PREC_LS*)(estimatedMean + adjIndex)); + variance = *((const __global _FLOAT_PREC_LS*)(estimatedVariance + adjIndex)); + pscale = *((const __global _FLOAT_PREC_LS*)(scale + adjIndex)); 
+ pbias = *((const __global _FLOAT_PREC_LS*)(bias + adjIndex)); + invVariance = rsqrt(fabs(variance + (_FLOAT_PREC_LS)epsilon)); + + for(int n = 0; n < batchSize; n++) { - adjIndex = (grpid * imageDims) + img_offset; - mean = estimatedMean[adjIndex]; - variance = estimatedVariance[adjIndex]; - invVariance = rsqrt(fabs(variance + epsilon)); - pvt_scale = *(scale + adjIndex); - pvt_bias = *(bias + adjIndex); + index = (n * batchStride) + adjIndex; + value = *((const __global _FLOAT_LS*)(in + index)); + + inhat = FLOAT2FLOATPREC_VEC(value); + inhat = (inhat - mean) * invVariance; + inhat = mad(pscale, inhat, pbias); + value = FLOATPREC2FLOAT_VEC(inhat); - for(int n = 0; n < batchSize; n++) - { - index = (batchStride * n) + adjIndex; - elemStd = FLOAT2FLOATPREC(*(in + index)) - mean; - inhat = elemStd * invVariance; - out[index] = FLOATPREC2FLOAT(mad(pvt_scale, inhat, pvt_bias)); - } + *((__global _FLOAT_LS*)(out + index)) = value; } } diff --git a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl index a81db2a03b..788dbb1340 100644 --- a/src/kernels/MIOpenBatchNormFwdInferSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdInferSpatial.cl @@ -2,7 +2,7 @@ * * MIT License * - * Copyright (c) 2017 Advanced Micro Devices, Inc. + * Copyright (c) 2017-2025 Advanced Micro Devices, Inc. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -42,34 +42,42 @@ MIOpenBatchNormFwdInferSpatialEst(const __global _FLOAT* __restrict in, /* x inp const __global _FLOAT_PREC* __restrict scale, const __global _FLOAT_PREC* __restrict bias, double epsilon, + unsigned int c, + unsigned int hw, unsigned int batchSize, - unsigned int imageDims, + unsigned int cStride, + unsigned int hwStride, unsigned int batchStride) { + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); - int xgid = get_global_id(0); - int ygid = get_global_id(1); + if(xgid * VEC_SIZE_X >= c || ygid * VEC_SIZE_Y >= hw) + return; unsigned int index; + _FLOAT_PREC_C mean, variance, invVariance; + _FLOAT_PREC_C pscale, pbias; + _FLOAT_PREC_LS inhat; + _FLOAT_LS value; - _FLOAT_PREC mean, variance, invVariance; - _FLOAT_PREC inhat; - _FLOAT_PREC pscale, pbias; + mean = *((const __global _FLOAT_PREC_C*)(estimatedMean + xgid * VEC_SIZE_X)); + variance = *((const __global _FLOAT_PREC_C*)(estimatedVariance + xgid * VEC_SIZE_X)); + pscale = *((const __global _FLOAT_PREC_C*)(scale + xgid * VEC_SIZE_X)); + pbias = *((const __global _FLOAT_PREC_C*)(bias + xgid * VEC_SIZE_X)); + invVariance = rsqrt(fabs(variance + (_FLOAT_PREC_C)epsilon)); - mean = *(estimatedMean + xgid); - variance = *(estimatedVariance + xgid); - pscale = *(scale + xgid); - pbias = *(bias + xgid); - invVariance = rsqrt(fabs(variance + epsilon)); - - for(int idx = ygid; idx < imageDims; idx += get_global_size(1)) + for(int n = 0; n < batchSize; n++) { - for(int n = 0; n < batchSize; n++) - { - index = (n * batchStride) + (xgid * imageDims) + idx; - inhat = (FLOAT2FLOATPREC(*(in + index)) - mean) * invVariance; - out[index] = FLOATPREC2FLOAT(mad(pscale, inhat, pbias)); - } + index = (n * batchStride) + (xgid * cStride * VEC_SIZE_X) + (ygid * hwStride * VEC_SIZE_Y); + value = *((const __global _FLOAT_LS*)(in 
+ index)); + + inhat = FLOAT2FLOATPREC_VEC(value); + inhat = (inhat - mean) * invVariance; + inhat = mad(pscale, inhat, (_FLOAT_PREC_LS)pbias); + value = FLOATPREC2FLOAT_VEC(inhat); + + *((__global _FLOAT_LS*)(out + index)) = value; } } // end spatial norm diff --git a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl index ef69963149..e3bf837cbf 100644 --- a/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl +++ b/src/kernels/MIOpenBatchNormFwdTrainSpatial.cl @@ -446,51 +446,77 @@ MIOpenBatchNormFwdTrainSpatialNorm(const __global _FLOAT* __restrict in, const __global _FLOAT_PREC* __restrict bias) { - // SPATIAL - _FLOAT_PREC mean = (_FLOAT_PREC)0.; - _FLOAT_PREC invVariance = (_FLOAT_PREC)0.; - _FLOAT_PREC inhat = (_FLOAT_PREC)0.; - _FLOAT_PREC pvt_scale = (_FLOAT_PREC)0.; - _FLOAT_PREC pvt_bias = (_FLOAT_PREC)0.; - __local _FLOAT_PREC lcl_bias; - __local _FLOAT_PREC lcl_scale; - __local _FLOAT lcl_mean, lcl_ivar; + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? 
MIO_BN_C : 1; + unsigned int xgrp_id = get_group_id(0); unsigned int ygrp_id = get_group_id(1); unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); + unsigned int xlid = get_local_id(0); unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; + + // SPATIAL + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C invVariance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_LS inhat = (_FLOAT_PREC_LS)0.; + _FLOAT_PREC_C pvt_scale = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C pvt_bias = (_FLOAT_PREC_C)0.; + _FLOAT_LS value; + __local _FLOAT_PREC_C lcl_bias[MIO_BN_GRP0]; + __local _FLOAT_PREC_C lcl_scale[MIO_BN_GRP0]; + __local _FLOAT_PREC_C lcl_mean[MIO_BN_GRP0]; + __local _FLOAT_PREC_C lcl_ivar[MIO_BN_GRP0]; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; // #4 apply the normalization :: x_hat = (x_i - mean) / sqrt(variance_accum + epsilon) if(get_local_id(1) == 0) { - lcl_scale = *(scale + xgid); - lcl_bias = *(bias + xgid); - lcl_mean = *(out + meanstashindex); // load stashed mean - lcl_ivar = *(out + varstashindex); + lcl_scale[xlid] = *((const __global _FLOAT_PREC_C*)(scale + xgid * VEC_SIZE_X)); + lcl_bias[xlid] = *((const __global _FLOAT_PREC_C*)(bias + xgid * VEC_SIZE_X)); + lcl_mean[xlid] = loadFromStash((__global _FLOAT_C*)out, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + lcl_ivar[xlid] = loadFromStash((__global _FLOAT_C*)out, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } barrier(CLK_LOCAL_MEM_FENCE); - if(ygid < MIO_BN_HW) + if(ygid * VEC_SIZE_Y < MIO_BN_HW) { - mean = FLOAT2FLOATPREC(lcl_mean); - invVariance = FLOAT2FLOATPREC(lcl_ivar); - pvt_scale = lcl_scale; - pvt_bias = lcl_bias; + mean = lcl_mean[xlid]; + invVariance 
= lcl_ivar[xlid]; + pvt_scale = lcl_scale[xlid]; + pvt_bias = lcl_bias[xlid]; #if(MIO_BN_HW > MIO_BN_LOOP_UNROLL_MAXHW) for(unsigned int n = 0; n < MIO_BN_N; n++) #else __attribute__((opencl_unroll_hint(2))) for(unsigned int n = 0; n < MIO_BN_N; n++) #endif { // apply normalization - index = n * MIO_BN_CHW + cidx + ygid; - inhat = (FLOAT2FLOATPREC(*(in + index)) - mean) * invVariance; + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + value = *((const __global _FLOAT_LS*)(in + index)); + inhat = FLOAT2FLOATPREC_VEC(value); + inhat = (inhat - mean) * invVariance; + inhat = mad(pvt_scale, inhat, pvt_bias); + value = FLOATPREC2FLOAT_VEC(inhat); // #5 Gamma and Beta adjust :: y_i = gamma*x_hat + beta - out[index] = FLOATPREC2FLOAT(mad(pvt_scale, inhat, pvt_bias)); + *((__global _FLOAT_LS*)(out + index)) = value; } // end for(n) } // end if(inImgIndex) } // end spatial norm @@ -516,65 +542,105 @@ MIOpenBatchNormFwdTrainSpatialFinalMeanVariance( #endif ) { - _FLOAT_PREC variance = (_FLOAT_PREC)0.; - _FLOAT_PREC invVariance = (_FLOAT_PREC)0.; - _FLOAT_PREC mean = (_FLOAT_PREC)0.; - unsigned int lid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); - unsigned int xgid = get_global_id(0); - unsigned int ygrp_sz = get_local_size(1); - unsigned int yngrps = get_num_groups(1); - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanstashindex = cidx + ygrp_sz * ygrp_id + 1; - unsigned int varstashindex = cidx + ygrp_sz * ygrp_id + 3; - unsigned int commitID = 0; - - for(int gn = 0; gn < yngrps; gn++) - { - unsigned int offset = gn * ygrp_sz + lid; - unsigned int meanindex = cidx + ygrp_sz * offset; - unsigned int varindex = cidx + ygrp_sz * offset + 2; - if(offset < yngrps) - { // modify to span larger number of groups - mean += FLOAT2FLOATPREC(*(meanvarbuff + meanindex)); - variance += FLOAT2FLOATPREC(*(meanvarbuff + varindex)); // load per group variance - } + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C 
invVariance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + unsigned int xgid = get_global_id(0); + unsigned int ygid = get_global_id(1); + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgrp_sz = get_local_size(0); + unsigned int ygrp_sz = get_local_size(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + unsigned int commitID = 0; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) + { + mean += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + variance += loadFromStash((__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDS_SIZE]; - lds_reduce2(&mean, &variance, (_FLOAT_ACCUM)INHW, lcl_data_x, lcl_data_y, lid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + // TODO: this simple approach has many bank conflicts, optimize if it affects performance + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + INHW, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else commitID = 64; - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDSGCN_SIZE]; - gcn_reduce2(&mean, &variance, (_FLOAT_ACCUM)INHW, lcl_data_x, lcl_data_y, lid); + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; + gcn_reduce2(&mean, &variance, INHW, lcl_data_x, lcl_data_y, ylid); #endif - barrier(CLK_LOCAL_MEM_FENCE | 
CLK_GLOBAL_MEM_FENCE); - variance = mad(-mean, mean, variance); - if(variance < 0) - { - variance = 0; - } - invVariance = rsqrt(variance + epsilon); - if(lid == commitID) - { - meanvarbuff[meanstashindex] = FLOATPREC2FLOAT(mean); // stash mean - meanvarbuff[varstashindex] = FLOATPREC2FLOAT(invVariance); // stash mean + variance = mad(-mean, mean, variance); + variance = max(variance, (_FLOAT_PREC_C)0.); + invVariance = rsqrt(variance + (_FLOAT_PREC_C)epsilon); + + for(unsigned int yoffset = ylid; yoffset < MIO_BN_NGRPS; yoffset += ygrp_sz) + { + storeToStash(mean, + (__global _FLOAT_C*)meanvarbuff, + 0, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(invVariance, + (__global _FLOAT_C*)meanvarbuff, + 1, + ygrp_sz * yoffset * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } // Save mean and calculate and save running mean - unsigned int ygid = get_global_id(1); if(ygid == commitID) { #if(MIO_RUNNING_RESULT == 1) - running_stash(resultRunningMean, resultRunningVariance, expAvgFactor, mean, variance, xgid); + running_stash((global _FLOAT_PREC_C*)resultRunningMean, + (global _FLOAT_PREC_C*)resultRunningVariance, + expAvgFactor, + mean, + variance, + xgid); #endif #if(MIO_SAVE_MEAN_VARIANCE == 1) - saved_stash(resultSaveMean, resultSaveInvVariance, mean, invVariance, xgid); + saved_stash((global _FLOAT_PREC_C*)resultSaveMean, + (global _FLOAT_PREC_C*)resultSaveInvVariance, + mean, + invVariance, + xgid); #endif } } @@ -584,50 +650,77 @@ MIOpenBatchNormFwdTrainSpatialMeanVariance(const __global _FLOAT* __restrict in, __global _FLOAT* __restrict mvbuff) { - unsigned int ylid = get_local_id(1); - unsigned int ygrp_id = get_group_id(1); unsigned int xgid = get_global_id(0); unsigned int ygid = get_global_id(1); + unsigned int xlid = get_local_id(0); + unsigned int ylid = get_local_id(1); + unsigned int xgrp_id = get_group_id(0); + unsigned int ygrp_id = get_group_id(1); + 
unsigned int xgrp_sz = get_local_size(0); unsigned int ygrp_sz = get_local_size(1); unsigned int index; - unsigned int cidx = xgid * MIO_BN_HW; - unsigned int meanindex = cidx + ygrp_sz * ygrp_id; - unsigned int varindex = meanindex + 2; - _FLOAT_ACCUM mean = (_FLOAT_ACCUM)0.; - _FLOAT_ACCUM variance = (_FLOAT_ACCUM)0.; - _FLOAT_ACCUM value = (_FLOAT_ACCUM)0.; + unsigned int xstride = MIO_LAYOUT_NHWC ? 1 : MIO_BN_HW; + unsigned int ystride = MIO_LAYOUT_NHWC ? MIO_BN_C : 1; + + _FLOAT_PREC_C mean = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_C variance = (_FLOAT_PREC_C)0.; + _FLOAT_PREC_LS value; + + if(xgid * VEC_SIZE_X >= MIO_BN_C) + return; - if(ygid < MIO_BN_HW) + if(ygid * VEC_SIZE_Y < MIO_BN_HW) { -#if(MIO_BN_HW > MIO_BN_LOOP_UNROLL_MAXHW) + _FLOAT_LS read4; for(unsigned int n = 0; n < MIO_BN_N; n++) -#else - __attribute__((opencl_unroll_hint(2))) for(unsigned int n = 0; n < MIO_BN_N; n++) -#endif { - index = n * MIO_BN_CHW + cidx + ygid; - value = FLOAT2ACCUM(*(in + index)); - mean += value; - variance = mad(value, value, variance); + index = n * MIO_BN_CHW + ygid * ystride * VEC_SIZE_Y + xgid * xstride * VEC_SIZE_X; + read4 = *((const global _FLOAT_LS*)(in + index)); + value = FLOAT2FLOATPREC_VEC(read4); + _ACCUMULATE(mean, value) + _ACCUMULATE_MAD(variance, value, value, variance) } } -#if !MIOPEN_USE_AMDGCN - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDS_SIZE]; - local _FLOAT_ACCUM lcl_data_y[MIO_BN_LDS_SIZE]; - lds_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); +#if !MIOPEN_USE_AMDGCN || MIO_BN_GRP0 > 1 || MIO_BN_LDSGCN_SIZE == 1 + // TODO: this simple approach has many bank conflicts, optimize if it affects performance + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDS_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDS_SIZE]; + lds_reduce2_2d(&mean, + &variance, + (_FLOAT_ACCUM)1.0, + lcl_data_x + xlid * ygrp_sz, + lcl_data_y + xlid * ygrp_sz, + ylid, + ygrp_sz); #else - local _FLOAT_ACCUM lcl_data_x[MIO_BN_LDSGCN_SIZE]; - local _FLOAT_ACCUM 
lcl_data_y[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_x[MIO_BN_LDSGCN_SIZE]; + local _FLOAT_ACCUM_C lcl_data_y[MIO_BN_LDSGCN_SIZE]; gcn_reduce2(&mean, &variance, (_FLOAT_ACCUM)1.0, lcl_data_x, lcl_data_y, ylid); #endif if(ylid == 0) { - mvbuff[meanindex] = ACCUM2FLOAT(mean); - mvbuff[varindex] = ACCUM2FLOAT(variance); + storeToStash(mean, + (__global _FLOAT_C*)mvbuff, + 0, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); + storeToStash(variance, + (__global _FLOAT_C*)mvbuff, + 1, + ygrp_sz * ygrp_id * VEC_SIZE_Y, + ystride / VEC_SIZE_X, + xgrp_sz, + xgrp_id, + xlid, + xstride); } -} // end spatial mean kernel +} #elif(MIO_BN_VARIANT == 3) diff --git a/src/kernels/batchnorm_functions.h b/src/kernels/batchnorm_functions.h index b95ff4cc1f..7f92514cb6 100644 --- a/src/kernels/batchnorm_functions.h +++ b/src/kernels/batchnorm_functions.h @@ -96,6 +96,7 @@ #define _AS_FLOAT PPCAT(as_, _FLOAT) #define _FLOAT_PREC4 PPCAT(_FLOAT_PREC, FOUR) +#define _FLOAT_ACCUM4 PPCAT(_FLOAT_ACCUM, FOUR) #ifndef MIO_BN_LDSGCN_SIZE #define MIO_BN_LDSGCN_SIZE 16 @@ -206,30 +207,211 @@ #define MIO_BN_GFX120X 0 #endif +#ifndef MIO_BN_VECTORIZE +#define MIO_BN_VECTORIZE 0 +#endif + +#ifndef MIO_BN_STASH_METHOD +#define MIO_BN_STASH_METHOD 0 +#endif + +#define FLOATPREC4_2_FLOAT4(val) \ + ((_FLOAT4)(FLOATPREC2FLOAT(val.x), \ + FLOATPREC2FLOAT(val.y), \ + FLOATPREC2FLOAT(val.z), \ + FLOATPREC2FLOAT(val.w))) + +#define FLOAT4_2_FLOATPREC4(val) \ + ((_FLOAT_PREC4)(FLOAT2FLOATPREC(val.x), \ + FLOAT2FLOATPREC(val.y), \ + FLOAT2FLOATPREC(val.z), \ + FLOAT2FLOATPREC(val.w))) + +#define _ACCUMULATE1(a, b) a += b; + +#define _ACCUMULATE_MAD1(a, b, c, d) a = mad(b, c, d); + +#define _ACCUMULATE4(a, b) \ + a += b.x; \ + a += b.y; \ + a += b.z; \ + a += b.w; + +#define _ACCUMULATE_MAD4(a, b, c, d) \ + a = mad(b.x, c.x, d); \ + a = mad(b.y, c.y, d); \ + a = mad(b.z, c.z, d); \ + a = mad(b.w, c.w, d); + +#if MIO_BN_VECTORIZE +#define VEC_SIZE 4 + 
+#if MIO_LAYOUT_NHWC +// NHWC vectorize in X direction which corresponds +// to channels +#define VEC_SIZE_X VEC_SIZE +#define VEC_SIZE_Y 1 +// _C suffix means used for computation +// _LS suffix means used for loading / storing +#define _FLOAT_PREC_C _FLOAT_PREC4 +#define _FLOAT_PREC_LS _FLOAT_PREC4 +#define _FLOAT_C _FLOAT4 +#define _FLOAT_LS _FLOAT4 +#define _FLOAT_ACCUM_C _FLOAT_ACCUM4 +#define _FLOAT_ACCUM_LS _FLOAT_ACCUM4 +#define _ACCUMULATE _ACCUMULATE1 +#define _ACCUMULATE_MAD _ACCUMULATE_MAD1 +#else +// NCHW vectorize in Y direction which corresponds +// to HW +#define VEC_SIZE_X 1 +#define VEC_SIZE_Y VEC_SIZE +#define _FLOAT_PREC_C _FLOAT_PREC +#define _FLOAT_PREC_LS _FLOAT_PREC4 +// _C suffix means used for computation +// _LS suffix means used for loading / storing +#define _FLOAT_C _FLOAT +#define _FLOAT_LS _FLOAT4 +#define _FLOAT_ACCUM_C _FLOAT_ACCUM +#define _FLOAT_ACCUM_LS _FLOAT_ACCUM4 +#define _ACCUMULATE _ACCUMULATE4 +#define _ACCUMULATE_MAD _ACCUMULATE_MAD4 +#endif + +#define FLOAT2FLOATPREC_VEC FLOAT4_2_FLOATPREC4 +#define FLOATPREC2FLOAT_VEC FLOATPREC4_2_FLOAT4 + +#else + +#define VEC_SIZE 1 +#define VEC_SIZE_X 1 +#define VEC_SIZE_Y 1 +#define _FLOAT_PREC_C _FLOAT_PREC +#define _FLOAT_PREC_LS _FLOAT_PREC +#define _FLOAT_C _FLOAT +#define _FLOAT_LS _FLOAT +#define _FLOAT_ACCUM_C _FLOAT_ACCUM +#define _FLOAT_ACCUM_LS _FLOAT_ACCUM +#define FLOAT2FLOATPREC_VEC FLOAT2FLOATPREC +#define FLOATPREC2FLOAT_VEC FLOATPREC2FLOAT +#define _ACCUMULATE _ACCUMULATE1 +#define _ACCUMULATE_MAD _ACCUMULATE_MAD1 + +#endif + #define UNUSED __attribute__((__unused__)) +#if(MIO_BN_VARIANT == 2) + +#if(MIO_BN_STASH_METHOD == 0) +// store values in HW dimension +#define NSTRIDE ystride +#else +// store values in N dimension +#define NSTRIDE (MIO_BN_C / VEC_SIZE_X * MIO_BN_HW) +#endif + +inline unsigned int getStashIndex(unsigned int vindex, + unsigned int ygroupoffset, + unsigned int ystride, + unsigned int xgrp_sz, + unsigned int xgrp_id, + unsigned int xlid, + 
unsigned int xstride) +{ +#if MIOPEN_USE_FPMIX || MIOPEN_USE_BFPMIX + // 2 _FLOAT values are used to store 1 _FLOAT_PREC value. +#if MIO_LAYOUT_NHWC +#if MIO_BN_C % 2 == 0 + // xgrp_sz values are split in two parts: even threads use 2 values at even rows, odd threads - + // at odd rows. + // The only restriction for C and xgrp_sz is that they must be even. + return (vindex * 2 + xlid % 2) * NSTRIDE + ygroupoffset * ystride + + (xgrp_sz * xgrp_id + xlid / 2 * 2) * xstride; +#else + // Values are stored consecutively in y dim. + return (vindex * 2) * NSTRIDE + ygroupoffset * ystride + (xgrp_sz * xgrp_id + xlid) * xstride; +#endif +#else // !MIO_LAYOUT_NHWC + // Values are stored consecutively in y dim, indices are aligned up by 2 (_FLOAT_PREC). + return ((vindex * 2) * NSTRIDE + ygroupoffset * ystride + (xgrp_sz * xgrp_id + xlid) * xstride + + 1) / + 2 * 2; +#endif +#else + return vindex * NSTRIDE + ygroupoffset * ystride + (xgrp_sz * xgrp_id + xlid) * xstride; +#endif +} + +inline _FLOAT_PREC_C loadFromStash(const __global _FLOAT_C* stash, + unsigned int vindex, + unsigned int ygroupoffset, + unsigned int ystride, + unsigned int xgrp_sz, + unsigned int xgrp_id, + unsigned int xlid, + unsigned int xstride) +{ + unsigned int index = + getStashIndex(vindex, ygroupoffset, ystride, xgrp_sz, xgrp_id, xlid, xstride); + +#if(MIO_BN_STASH_METHOD == 0 || MIO_BN_STASH_METHOD == 1) + return *((const __global _FLOAT_PREC_C*)(stash + index)); +#else + _FLOAT_PREC_C value; + *((_FLOAT_C*)(&value)) = *(stash + index); + index += NSTRIDE; + *((_FLOAT_C*)(&value) + 1) = *(stash + index); + + return value; +#endif +} + +inline void storeToStash(_FLOAT_PREC_C value, + __global _FLOAT_C* stash, + unsigned int vindex, + unsigned int ygroupoffset, + unsigned int ystride, + unsigned int xgrp_sz, + unsigned int xgrp_id, + unsigned int xlid, + unsigned int xstride) +{ + unsigned int index = + getStashIndex(vindex, ygroupoffset, ystride, xgrp_sz, xgrp_id, xlid, xstride); + 
+#if(MIO_BN_STASH_METHOD == 0 || MIO_BN_STASH_METHOD == 1) + *((__global _FLOAT_PREC_C*)(stash + index)) = value; +#else + *(stash + index) = *((_FLOAT_C*)(&value)); + index += NSTRIDE; + *(stash + index) = *((_FLOAT_C*)(&value) + 1); +#endif +} +#endif + #if(MIO_BN_VARIANT != 4) -static inline void running_stash(global _FLOAT_PREC* resultRunningMean, - global _FLOAT_PREC* resultRunningVariance, +static inline void running_stash(global _FLOAT_PREC_C* resultRunningMean, + global _FLOAT_PREC_C* resultRunningVariance, double expAvgFactor, - _FLOAT_ACCUM mean, - _FLOAT_ACCUM variance, + _FLOAT_ACCUM_C mean, + _FLOAT_ACCUM_C variance, uint channel) { - _FLOAT_ACCUM pvt_runMean = (_FLOAT_ACCUM)(*(resultRunningMean + channel)); - _FLOAT_ACCUM pvt_newRunMean = + _FLOAT_ACCUM_C pvt_runMean = (_FLOAT_ACCUM_C)(*(resultRunningMean + channel)); + _FLOAT_ACCUM_C pvt_newRunMean = mad((_FLOAT_ACCUM)-expAvgFactor, pvt_runMean, pvt_runMean); // tmp = oldRunMean*(1-factor) - resultRunningMean[channel] = - (_FLOAT_PREC)mad(mean, (_FLOAT_ACCUM)expAvgFactor, pvt_newRunMean); // newMean*factor + tmp - const _FLOAT_ACCUM adjust = - (_FLOAT_ACCUM)((MIO_BN_NHW == 1) - ? variance - : variance * ((_FLOAT_ACCUM)MIO_BN_NHW / - ((_FLOAT_ACCUM)MIO_BN_NHW - (_FLOAT_ACCUM)1.0))); + resultRunningMean[channel] = (_FLOAT_PREC_C)mad( + mean, (_FLOAT_ACCUM)expAvgFactor, pvt_newRunMean); // newMean*factor + tmp + const _FLOAT_ACCUM_C adjust = + (_FLOAT_ACCUM_C)((MIO_BN_NHW == 1) + ? 
variance + : variance * ((_FLOAT_ACCUM)MIO_BN_NHW / + ((_FLOAT_ACCUM)MIO_BN_NHW - (_FLOAT_ACCUM)1.0))); resultRunningVariance[channel] = - (_FLOAT_PREC)((1 - (_FLOAT_ACCUM)expAvgFactor) * - (_FLOAT_ACCUM)(*(resultRunningVariance + channel)) + - (_FLOAT_ACCUM)expAvgFactor * adjust); + (_FLOAT_PREC_C)((1 - (_FLOAT_ACCUM)expAvgFactor) * + (_FLOAT_ACCUM_C)(*(resultRunningVariance + channel)) + + (_FLOAT_ACCUM)expAvgFactor * adjust); } static inline void running_stash_pa(global _FLOAT_PREC* resultRunningMean, @@ -279,12 +461,12 @@ static inline void running_stash_dyn(global _FLOAT_PREC* resultRunningMean, } #endif -static inline void saved_stash(global _FLOAT_PREC* resultSaveMean, - global _FLOAT_PREC* resultSaveInvVariance, - _FLOAT_ACCUM mean, - _FLOAT_ACCUM invVariance, +static inline void saved_stash(global _FLOAT_PREC_C* resultSaveMean, + global _FLOAT_PREC_C* resultSaveInvVariance, + _FLOAT_ACCUM_C mean, + _FLOAT_ACCUM_C invVariance, uint channel) { - *(resultSaveMean + channel) = (_FLOAT_PREC)mean; - *(resultSaveInvVariance + channel) = (_FLOAT_PREC)invVariance; + *(resultSaveMean + channel) = (_FLOAT_PREC_C)mean; + *(resultSaveInvVariance + channel) = (_FLOAT_PREC_C)invVariance; } diff --git a/src/kernels/reduction_functions.h b/src/kernels/reduction_functions.h index 4e8c607982..88aeed431a 100644 --- a/src/kernels/reduction_functions.h +++ b/src/kernels/reduction_functions.h @@ -59,6 +59,30 @@ regLDSreduce(_FLOAT_ACCUM* value, local _FLOAT_ACCUM* data, uint localID, _FLOAT #endif +static inline void lds_reduce2_2d(_FLOAT_ACCUM_C* x, + _FLOAT_ACCUM_C* y, + _FLOAT_ACCUM scale, + local _FLOAT_ACCUM_C* lcl_data_x, + local _FLOAT_ACCUM_C* lcl_data_y, + unsigned int lid, + unsigned int size) +{ + lcl_data_x[lid] = (_FLOAT_ACCUM_C)*x; + lcl_data_y[lid] = (_FLOAT_ACCUM_C)*y; + barrier(CLK_LOCAL_MEM_FENCE); + for(unsigned int red = (size >> 1); red > 0; red >>= 1) + { + if(lid < red) + { + lcl_data_x[lid] += lcl_data_x[lid + red]; + lcl_data_y[lid] += lcl_data_y[lid + 
red]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + *x = (_FLOAT_ACCUM_C)(lcl_data_x[0] * scale); + *y = (_FLOAT_ACCUM_C)(lcl_data_y[0] * scale); +} + #if MIOPEN_USE_AMDGCN static inline void dpp_reduction(_FLOAT_ACCUM* temp_sum) { diff --git a/src/ocl/batchnormocl.cpp b/src/ocl/batchnormocl.cpp index 7d3310145b..dca3ea6803 100644 --- a/src/ocl/batchnormocl.cpp +++ b/src/ocl/batchnormocl.cpp @@ -129,7 +129,8 @@ void BatchNormForwardTraining(const Handle& handle, expAvgFactor, epsilon, resultsave, - resultrunning}; + resultrunning, + size_t(0.6f * handle.GetMaxComputeUnits())}; const auto algo = bn_mode == miopenBNSpatial ? AlgorithmName{"miopenBatchNormForwardTrainingSpatial"} @@ -151,10 +152,9 @@ void BatchNormForwardTraining(const Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKFwdTraining>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); @@ -250,9 +250,8 @@ void BatchNormForwardInference(const Handle& handle, }(); const auto algo = AlgorithmName{"miopenBatchNormalizationForwardInference"}; - const auto solvers = solver::SolverContainer{}; + const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKFwdInference>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); } @@ -373,7 +372,8 @@ void BatchNormBackward(const Handle& handle, savedMeanDesc, savedVarianceDesc, epsilon, - useSaved}; + useSaved, + size_t(0.6f * handle.GetMaxComputeUnits())}; const auto algo = bn_mode == miopenBNSpatial ? 
AlgorithmName{"miopenBatchNormBackwardPropSpatial"} @@ -394,10 +394,9 @@ void BatchNormBackward(const Handle& handle, return tmp; }(); - const auto solvers = solver::SolverContainer{}; + // solver::batchnorm::BnCKBwdBackward>{}; solvers.ExecutePrimitive(handle, problem, algo, invoke_params); diff --git a/src/solver.cpp b/src/solver.cpp index c3b077d435..bb707fff81 100644 --- a/src/solver.cpp +++ b/src/solver.cpp @@ -562,19 +562,18 @@ inline SolverRegistrar::SolverRegistrar(IdRegistryData& registry) Register(registry, ++id, Primitive::Activation, activ::ActivBwdSolver0{}.SolverDbId()); Register(registry, ++id, Primitive::Activation, activ::ActivBwdSolver1{}.SolverDbId()); - RegisterWithSolver(registry, ++id, Primitive::Batchnorm); + // combine BnFwdTrainingSpatialMultiple and BnFwdTrainingSpatialSingle + RegisterWithSolver(registry, ++id, Primitive::Batchnorm); RegisterWithSolver( registry, ++id, conv::ConvCkIgemmFwdV6r1DlopsNchw{}, miopenConvolutionAlgoImplicitGEMM); - RegisterWithSolver( - registry, ++id, Primitive::Batchnorm); - + ++id; // removed solver BnFwdTrainingSpatialMultiple (it is now part of BnFwdTrainingSpatial) RegisterWithSolver(registry, ++id, Primitive::Batchnorm); - RegisterWithSolver(registry, ++id, Primitive::Batchnorm); - RegisterWithSolver( - registry, ++id, Primitive::Batchnorm); + // combine BnBwdTrainingSpatialMultiple and BnBwdTrainingSpatialSingle + RegisterWithSolver(registry, ++id, Primitive::Batchnorm); + ++id; // removed solver BnBwdTrainingSpatialMultiple (it is now part of BnBwdTrainingSpatial) RegisterWithSolver(registry, ++id, Primitive::Batchnorm); RegisterWithSolver(registry, ++id, Primitive::Batchnorm); diff --git a/src/solver/batchnorm/backward_spatial.cpp b/src/solver/batchnorm/backward_spatial.cpp new file mode 100644 index 0000000000..75fc09f4a5 --- /dev/null +++ b/src/solver/batchnorm/backward_spatial.cpp @@ -0,0 +1,425 @@ +/******************************************************************************* + * + * MIT License 
+ * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +// Spatial multiple needs space for 4 fp32 elements +// per each x thread (including the last workgroup) +// to stash intermediate mean and variance +const unsigned int stash_values_bwd = 4; + +bool PerformanceConfigBnBwdBackward::IsValid( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const +{ + if(this->kernel_id.empty()) + { + return false; + } + + // if default config is variant 2, check if it can be applied + // (based on variant 2 restrictions) + size_t vectorsize; + int variant = 0; + GetVariantFromKernelId(this->kernel_id, variant, vectorsize); + if(variant == 2) + { + return IsSpatialMultipleApplicable(problem, vectorsize, stash_values_bwd); + } + return true; +} + +void PerformanceConfigBnBwdBackward::HeuristicInit( + const miopen::batchnorm::ProblemDescription& problem) +{ + // Define default configuration based on heuristics and + // add all other valid configurations for the given problem + if(UseMultiple(problem)) + { + DefaultConfigSpatialMultiple(problem, stash_values_bwd, this->valid_kernels); + DefaultConfigSpatialSingle(problem, this->valid_kernels); + } + else + { + DefaultConfigSpatialSingle(problem, this->valid_kernels); + DefaultConfigSpatialMultiple(problem, stash_values_bwd, this->valid_kernels); + } + + // Set index and kernel_id to default value + this->index = 0; + this->kernel_id = valid_kernels[0]; +} + +bool PerformanceConfigBnBwdBackward::SetNextValue( + const miopen::batchnorm::ProblemDescription& problem_desc) +{ + // In case the valid_kernel list is empty, we fill it with + // default value as first one and all other valid ones will follow + if(this->valid_kernels.empty()) + { + this->HeuristicInit(problem_desc); + return true; + } + // Get next valid 
configuration + if((this->index + 1) < valid_kernels.size()) + { + ++this->index; + this->kernel_id = this->valid_kernels[index]; + return true; + } + else + { + return false; + } +} + +bool PerformanceConfigBnBwdBackward::operator==(const PerformanceConfigBnBwdBackward& other) const +{ + return this->kernel_id == other.kernel_id; +} + +bool PerformanceConfigBnBwdBackward::IsValidValue() const +{ + return this->index >= 0 && this->index < valid_kernels.size(); +} + +bool BnBwdTrainingSpatial::IsApplicable( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const +{ + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward || + bn_problem.GetMode() != miopenBNSpatial) + return false; + + if(!bn_problem.Is2D()) + return false; + +#if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR + if(bn_problem.GetXDesc().GetType() == miopenHalf && + bn_problem.GetBnScale().GetType() == miopenHalf) + { + // bfp16parm = true; + // Unsupported kernel mode, error in kernel code + // MIOpenBatchNormBwdSpatial.cl:526 issue#1549 + return false; + } +#endif + if(!IsOCLBwdTypeValid(bn_problem)) + return false; + + return true; +} + +PerformanceConfigBnBwdBackward BnBwdTrainingSpatial::GetDefaultPerformanceConfig( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem_desc) const +{ + PerformanceConfigBnBwdBackward pp; + pp.HeuristicInit(problem_desc); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} + +bool BnBwdTrainingSpatial::IsValidPerformanceConfig( + const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem_desc, + const PerformanceConfigBnBwdBackward& config) const +{ + return config.IsValid(ctx, problem_desc); +} + +PerformanceConfigBnBwdBackward +BnBwdTrainingSpatial::Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, problem, invoke_ctx); +} + +ConvSolution 
BnBwdTrainingSpatial::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnBwdBackward& config) const +{ + const auto& handle = context.GetStream(); + const unsigned wavesize = (miopen::StartsWith(handle.GetDeviceName(), "gfx10") ? 32 : 64); + + bool bfpmixparm = false; + bool bbfpmixparam = false; + bool bfp16parm = false; + bool bfp32parm = true; + + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) + { + bfp16parm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenHalf && + problem.GetBnScale().GetType() == miopenFloat) + { + bfpmixparm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat) + { + bbfpmixparam = true; + bfp32parm = false; + } + + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nstride = c * in_cstride; + unsigned int in_nhw = n * in_cstride; + unsigned int in_nchw = n * in_nstride; + + auto inhw = float(1.0 / in_nhw); + + int variant = -1; + size_t vectorsize = 1; + GetVariantFromKernelId(config.kernel_id, variant, vectorsize); + + size_t xlocalsize, xgridsize; + size_t ylocalsize = 1, ygridsize = 1, zlocalsize = 1, zgridsize = 1; + unsigned int ldsgcn, ldsnogcn; + int stash_method = -1; + if(variant != 2) + { + xlocalsize = 1024; + xgridsize = static_cast(1024) * c; + ldsgcn = xlocalsize / wavesize; + ldsnogcn = xlocalsize; + } + else + { + GetSpatialMultipleConfig(problem, + stash_values_bwd, + vectorsize, + xlocalsize, + ylocalsize, + xgridsize, + ygridsize, + stash_method); + ldsnogcn = xlocalsize * ylocalsize; + ldsgcn = xlocalsize * ylocalsize / wavesize; + } + + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", 
static_cast(bfp16parm)}, + {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, + {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, + {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, + {"MIO_BN_USESAVED", static_cast(problem.UseSaved())}, + {"MIO_BN_N", static_cast(n)}, + {"MIO_BN_C", static_cast(c)}, + {"MIO_BN_HW", static_cast(in_cstride)}, + {"MIO_BN_NHW", static_cast(in_nhw)}, + {"MIO_BN_CHW", in_nstride}, + {"MIO_BN_NCHW", in_nchw}, + {"MIO_BN_NGRPS", ygridsize / ylocalsize}, + {"MIO_BN_LDS_SIZE", ldsnogcn}, + {"MIO_BN_LDSGCN_SIZE", ldsgcn}, + {"MIO_BN_VARIANT", variant}, + {"MIO_WAVESIZE", wavesize}, + {"MIO_BN_GRP0", xlocalsize}, + {"MIO_BN_GRP1", ylocalsize}, + {"MIO_BN_GRP2", zlocalsize}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, + {"MIO_BN_VECTORIZE", static_cast(vectorsize > 1)}, + {"MIO_BN_STASH_METHOD", stash_method}, + }; + + { + // OpenCL kernels for variant 0-4 + kernel.kernel_file = "MIOpenBatchNormBwdSpatial.cl"; + std::string kernel_name = "MIOpenBatchNormBwdSpatial"; + + build_params << KernelBuildParameters{ + {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, + {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, + {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + if(variant != 2) + { + kernel.kernel_name = kernel_name; + result.construction_params.push_back(kernel); + } + else + { + auto single_ygroup_kernel = kernel; + + single_ygroup_kernel.g_wk[1] = single_ygroup_kernel.l_wk[1]; + + if(!problem.UseSaved()) + { + kernel.kernel_name = kernel_name + "MeanVariance"; + result.construction_params.push_back(kernel); + + single_ygroup_kernel.kernel_name = kernel_name + "FinalMeanVariance"; + result.construction_params.push_back(single_ygroup_kernel); + } + + kernel.kernel_name = kernel_name + "DScaleDBias"; + result.construction_params.push_back(kernel); + + single_ygroup_kernel.kernel_name = kernel_name + "FinalDScaleDBias"; + result.construction_params.push_back(single_ygroup_kernel); + + kernel.kernel_name = kernel_name + "DX"; + result.construction_params.push_back(kernel); + } + } + } + + const auto dtype = problem.GetBnScale().GetType(); + const auto useSaved = problem.UseSaved(); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + + float ctime = 0.; + visit_float(dtype, [&](auto as_float) { + if(variant != 2) + { + decltype(auto) kernel = handle_.Run(kernels.front()); + if(useSaved) + { + kernel(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + params.savedMean, + params.savedInvVariance, + as_float(inhw)); + } + else + { + kernel(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + params.epsilon, + inhw); + } + } + else + { + if(useSaved) + { + 
handle_.Run(kernels[0])(params.x, + params.dy, + params.dx, + params.savedMean, + params.savedInvVariance); + profileSequence(handle_, 0, &ctime); + + handle_.Run(kernels[1])( + params.dx, params.resultBnScaleDiff, params.resultBnBiasDiff); + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[2])(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + params.savedMean, + params.savedInvVariance, + as_float(inhw)); + profileSequence(handle_, 2, &ctime); + } + else + { + handle_.Run(kernels[0])(params.x, params.dx); // mean variance + profileSequence(handle_, 0, &ctime); + + handle_.Run(kernels[1])( + params.dx, as_float(inhw), params.epsilon); // final mean variance + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[2])(params.x, params.dy, params.dx); // dscale dbias + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[3])(params.dx, + params.resultBnScaleDiff, + params.resultBnBiasDiff); // final dscale dbias + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[4])(params.x, + params.dy, + params.dx, + params.bnScale, + params.resultBnScaleDiff, + params.resultBnBiasDiff, + as_float(inhw)); // dx + profileSequence(handle_, 2, &ctime); + } + } + }); + }; + }; + + return result; +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/batchnorm/backward_spatial_multiple.cpp b/src/solver/batchnorm/backward_spatial_multiple.cpp deleted file mode 100644 index e26922f478..0000000000 --- a/src/solver/batchnorm/backward_spatial_multiple.cpp +++ /dev/null @@ -1,352 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnBwdTrainingSpatialMultiple::IsApplicable( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - if(problem.GetDirection() != miopen::batchnorm::Direction::Backward || - problem.GetMode() != miopenBNSpatial) - return false; - if(!problem.Is2D()) - { - return false; - } - if(!IsOCLBwdTypeValid(problem)) - return false; - -#if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - // bfp16parm = true; - // Unsupported kernel mode, error in kernel code - // MIOpenBatchNormBwdSpatial.cl:526 issue#1549 - return false; - } -#endif - - return !BnBwdTrainingSpatialSingle{}.IsApplicable(context, problem); -} - -ConvSolution BnBwdTrainingSpatialMultiple::GetSolution( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - - 
auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1; - size_t ylocalsize = 1; - - size_t xgridsize = 1; - size_t ygridsize = 1; - - unsigned int ldsgcn = 0; - unsigned int ldsnogcn = 0; - int variant = 1; - - if(problem.IsLayoutNHWC()) - { - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else - { - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize - // work groups over channels and loop through NHW. - //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 
1 : 3; - xlocalsize = std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) - { - if((n > 64) && (in_cstride > 160)) - { - variant = 3; - xlocalsize = - std::min(64 * ((in_cstride + 63) / 64), static_cast(1024)); - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else - { - variant = 0; - if(bfp32parm) - { - xlocalsize = 1024; - xgridsize = static_cast(1024) * c; - } - else - { - xlocalsize = 256; - xgridsize = static_cast(256) * c; - } - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize - // work groups over channels and data segments. 
- //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - } - - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - kernel.kernel_file = "MIOpenBatchNormBwdSpatial.cl"; - kernel.kernel_name = "MIOpenBatchNormBwdSpatial"; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_BN_USESAVED", static_cast(problem.UseSaved())}, - {"MIO_BN_N", static_cast(n)}, - {"MIO_BN_C", static_cast(c)}, - {"MIO_BN_HW", static_cast(in_cstride)}, - {"MIO_BN_NHW", static_cast(in_nhw)}, - {"MIO_BN_CHW", in_nstride}, - {"MIO_BN_NCHW", in_nchw}, - {"MIO_BN_NGRPS", int(std::ceil(float(ygridsize) / ylocalsize))}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", ldsgcn}, - {"MIO_BN_VARIANT", variant}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - if(problem.UseSaved()) - { - auto copy = kernel; - - copy.kernel_name = kernel.kernel_name + "DScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalDScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "DX"; - result.construction_params.push_back(copy); - } - else - { - auto copy = kernel; - - copy.kernel_name = kernel.kernel_name + "MeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalMeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "DScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalDScaleDBias"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "DX"; - result.construction_params.push_back(copy); - } - } - - const auto dtype = problem.GetBnScale().GetType(); - const auto useSaved = problem.UseSaved(); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) params = raw_params.CastTo(); - - float ctime = 0.; - visit_float(dtype, [&](auto as_float) { - if(useSaved) - { - handle_.Run(kernels[0])( - params.x, params.dy, params.dx, params.savedMean, params.savedInvVariance); - profileSequence(handle_, 0, &ctime); - - handle_.Run(kernels[1])( - params.dx, params.resultBnScaleDiff, params.resultBnBiasDiff); - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[2])(params.x, - params.dy, - 
params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - params.savedMean, - params.savedInvVariance, - as_float(inhw)); - profileSequence(handle_, 2, &ctime); - } - else - { - handle_.Run(kernels[0])(params.x, params.dx); // mean variance - profileSequence(handle_, 0, &ctime); - - handle_.Run(kernels[1])( - params.dx, as_float(inhw), params.epsilon); // final mean variance - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[2])(params.x, params.dy, params.dx); // dscale dbias - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[3])(params.dx, - params.resultBnScaleDiff, - params.resultBnBiasDiff); // final dscale dbias - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[4])(params.x, - params.dy, - params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - as_float(inhw)); // dx - profileSequence(handle_, 2, &ctime); - } - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git a/src/solver/batchnorm/backward_spatial_single.cpp b/src/solver/batchnorm/backward_spatial_single.cpp deleted file mode 100644 index 9b375f517c..0000000000 --- a/src/solver/batchnorm/backward_spatial_single.cpp +++ /dev/null @@ -1,330 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include - -#define WORKAROUND_ISSUE_1146 1 // check asm solver applicability for gfx90a - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnBwdTrainingSpatialSingle::IsApplicable( - const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const -{ - if(bn_problem.GetDirection() != miopen::batchnorm::Direction::Backward || - bn_problem.GetMode() != miopenBNSpatial) - return false; - if(!bn_problem.Is2D()) - return false; - -#if WORKAROUND_ISSUE_1549_FP16_BUILD_ERROR - if(bn_problem.GetXDesc().GetType() == miopenHalf && - bn_problem.GetBnScale().GetType() == miopenHalf) - { - // bfp16parm = true; - // Unsupported kernel mode, error in kernel code - // MIOpenBatchNormBwdSpatial.cl:526 issue#1549 - return false; - } -#endif - if(!IsOCLBwdTypeValid(bn_problem)) - return false; - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(bn_problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nhw = n * in_cstride; - - return (in_cstride > 1024 && in_nhw < (32 * 1024 * 1024)) || - (in_cstride > 512 && in_nhw < (32 * 1024 * 1024)) || in_cstride <= 512; -} - -ConvSolution -BnBwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - const unsigned wavesize = (miopen::StartsWith(handle.GetDeviceName(), "gfx10") ? 
32 : 64); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - - auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1; - size_t ylocalsize = 1; - - size_t xgridsize = 1; - size_t ygridsize = 1; - - unsigned int ldsgcn = 0; - unsigned int ldsnogcn = 0; - int variant = 1; - - if(problem.IsLayoutNHWC()) - { - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - else - { - //************************************************************************************************* - // N*H*W < 32M and H*W > 1024, use batchnorm variant#1 implementation which parallelize - // work groups over channels and loop through NHW. 
- //************************************************************************************************* - if((in_nhw < (32 * 1024 * 1024) && in_cstride > 1024)) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // N*H*W < 32M and H*W > 512 use batchnorm variant#1 or variant#3 implementation which - // parallelize - // work groups over channels and loop through N. - //************************************************************************************************* - else if(in_nhw < (32 * 1024 * 1024) && in_cstride > 512) - { - variant = (n >= 32) ? 1 : 3; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - //************************************************************************************************* - // H*W < 512 use batchnorm variant#0 or variant#3 implementation based on batch size and - // H*W - //************************************************************************************************* - else if(in_cstride <= 512) - { - if((n > 64) && (in_cstride > 160)) - { - variant = 3; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - else - { - variant = 0; - xlocalsize = 1024; - xgridsize = static_cast(1024) * c; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - } - //************************************************************************************************* - // N*H*W > 32M, use batchnorm variant#2 implementation which parallelize - // work groups over channels and data segments. 
- //************************************************************************************************* - else - { - variant = 2; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / wavesize; - ldsnogcn = ylocalsize; - } - if((in_cstride < 200) && (in_cstride > 60) && bfpmixparm) - { - variant = 1; - xlocalsize = 1024; - xgridsize = c * xlocalsize; - ldsgcn = xlocalsize / wavesize; - ldsnogcn = xlocalsize; - } - } - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_BN_USESAVED", static_cast(problem.UseSaved())}, - {"MIO_BN_N", static_cast(n)}, - {"MIO_BN_C", static_cast(c)}, - {"MIO_BN_HW", static_cast(in_cstride)}, - {"MIO_BN_NHW", static_cast(in_nhw)}, - {"MIO_BN_CHW", in_nstride}, - {"MIO_BN_NCHW", in_nchw}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", ldsgcn}, - {"MIO_BN_VARIANT", variant}, - {"MIO_WAVESIZE", wavesize}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - if((n > 64) && (n % 2 == 0) && (variant == 3) && (bfpmixparm) && (problem.UseSaved()) && - context.use_asm_kernels && context.rmv.IsV2orV3() && - (StartsWith(handle.GetDeviceName(), "gfx8") || - (StartsWith(handle.GetDeviceName(), "gfx9") -#if WORKAROUND_ISSUE_1146 - && (handle.GetDeviceName() != "gfx90a") -#endif - && (!StartsWith(handle.GetDeviceName(), "gfx94")))) && - (!handle.GetTargetProperties().Xnack() || !*handle.GetTargetProperties().Xnack())) - { - kernel.kernel_file = "gcnAsmBNBwdTrainSpatial.s"; - kernel.kernel_name 
= "miopenGcnAsmBNBwdTrainSpatial"; - - union - { - unsigned u32; - float f32 = 0; - } NHW_value; - - NHW_value.f32 = static_cast(in_nhw); - - build_params << KernelBuildParameters{ - {"ROCM_METADATA_VERSION", context.rmv.UseV3() ? "5" : "4"}, - {"MIO_BN_NHW_FLOAT", NHW_value.u32}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::GcnAsm{}); - } - else - { - kernel.kernel_file = "MIOpenBatchNormBwdSpatial.cl"; - kernel.kernel_name = "MIOpenBatchNormBwdSpatial"; - - build_params << KernelBuildParameters{ - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - } - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } - - const auto dtype = problem.GetBnScale().GetType(); - const auto useSaved = problem.UseSaved(); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - visit_float(dtype, [&](auto as_float) { - if(useSaved) - { - kernel(params.x, - params.dy, - params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - params.savedMean, - params.savedInvVariance, - as_float(inhw)); - } - else - { - kernel(params.x, - params.dy, - params.dx, - params.bnScale, - params.resultBnScaleDiff, - params.resultBnBiasDiff, - params.epsilon, - inhw); - } - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git 
a/src/solver/batchnorm/forward_inference.cpp b/src/solver/batchnorm/forward_inference.cpp index a05fce5105..d338f8001a 100644 --- a/src/solver/batchnorm/forward_inference.cpp +++ b/src/solver/batchnorm/forward_inference.cpp @@ -1,172 +1,210 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnFwdInference::IsApplicable(const ExecutionContext&, - const miopen::batchnorm::ProblemDescription& bn_problem) const -{ - if(bn_problem.IsLayoutNHWC()) - return false; - if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) - return false; - if(!(bn_problem.IsFp32() or bn_problem.IsFp16() or bn_problem.IsBFp16())) - return false; - if(!bn_problem.Is2D()) - return false; - if(!IsOCLInferTypeValid(bn_problem)) - return false; - - return true; -} - -ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t xlocalsize = 1; - auto xgridsize = c; - size_t ylocalsize = 256; - size_t ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize); - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - kernel.kernel_file = "MIOpenBatchNormFwdInfer"; // build this up - kernel.kernel_name = "MIOpenBatchNormFwdInfer"; - 
if(problem.GetMode() == miopenBNSpatial) - { // SPATIAL kernels - kernel.kernel_file += "Spatial.cl"; - kernel.kernel_name += "SpatialEst"; - } - else - { // PER ACTIVATION - kernel.kernel_file += "PerAct.cl"; - kernel.kernel_name += "PerActivationEst"; - } - - const auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } - - result.invoker_factory = [](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - - int n_, c_, h_, w_; - std::tie(n_, c_, h_, w_) = tien<4>(params.xDesc->GetLengths()); - - unsigned int in_nstride_ = c_ * h_ * w_; - unsigned int in_cstride_ = h_ * w_; - - kernel(params.x, - params.y, - params.estimatedMean, - params.estimatedVariance, - params.bnScale, - params.bnBias, - params.epsilon, - n_, - in_cstride_, - in_nstride_); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen +/******************************************************************************* + * + * MIT License + * + * 
Copyright (c) 2021-2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include + +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +bool BnFwdInference::IsApplicable(const ExecutionContext&, + const miopen::batchnorm::ProblemDescription& bn_problem) const +{ + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardInference) + return false; + if(!(bn_problem.IsFp32() or bn_problem.IsFp16() or bn_problem.IsBFp16())) + return false; + if(!bn_problem.Is2D()) + return false; + if(!IsOCLInferTypeValid(bn_problem)) + return false; + + return true; +} + +ConvSolution BnFwdInference::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem) const +{ + const auto& handle = context.GetStream(); + + bool bfpmixparm = false; + bool bbfpmixparam = false; + bool bfp16parm = false; + bool bfp32parm = true; + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) + { + bfp16parm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenHalf && + problem.GetBnScale().GetType() == miopenFloat) + { + bfpmixparm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat) + { + bbfpmixparam = true; + bfp32parm = false; + } + + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + + auto result = ConvSolution{miopenStatusSuccess}; + + { + size_t xlocalsize, xgridsize, ylocalsize, ygridsize, zlocalsize, zgridsize; + size_t max_localsize = 256; + bool vectorize; + if(problem.GetXDesc().GetLayout_t() == miopenTensorNHWC) + { + vectorize = c % 4 == 0; + int vector_size = vectorize ? 
4 : 1; + xlocalsize = std::min(size_t{c / vector_size}, max_localsize); + xgridsize = xlocalsize * ((c / vector_size + xlocalsize - 1) / xlocalsize); + ylocalsize = max_localsize / xlocalsize; + ygridsize = ylocalsize * ((in_cstride + ylocalsize - 1) / ylocalsize); + } + else + { + vectorize = in_cstride % 4 == 0; + int vector_size = vectorize ? 4 : 1; + xlocalsize = 1; + xgridsize = c; + ylocalsize = max_localsize; + ygridsize = ylocalsize * ((in_cstride / vector_size + ylocalsize - 1) / ylocalsize); + } + zlocalsize = 1; + zgridsize = 1; + + auto kernel = KernelInfo{}; + + kernel.kernel_file = "MIOpenBatchNormFwdInfer"; // build this up + kernel.kernel_name = "MIOpenBatchNormFwdInfer"; + if(problem.GetMode() == miopenBNSpatial) + { // SPATIAL kernels + kernel.kernel_file += "Spatial.cl"; + kernel.kernel_name += "SpatialEst"; + } + else + { // PER ACTIVATION + kernel.kernel_file += "PerAct.cl"; + kernel.kernel_name += "PerActivationEst"; + } + + const auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, + {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, + {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, + {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, + {"MIO_BN_GRP0", xlocalsize}, + {"MIO_BN_GRP1", ylocalsize}, + {"MIO_BN_GRP2", zlocalsize}, + {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, + {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, + {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, + {"MIO_BN_VECTORIZE", static_cast(vectorize)}, + }; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + result.construction_params.push_back(kernel); + } + + result.invoker_factory = [](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) kernel = handle_.Run(kernels.front()); + decltype(auto) params = raw_params.CastTo(); + + int n_, c_, h_, w_; + std::tie(n_, c_, h_, w_) = tien<4>(params.xDesc->GetLengths()); + + unsigned int in_nstride_ = c_ * h_ * w_; + + if(params.xDesc->GetLayout_t() == miopenTensorNHWC) + { + kernel(params.x, + params.y, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias, + params.epsilon, + c_, + h_ * w_, + n_, + 1, // cStride + c_, // hwStride + in_nstride_); // batchStride + } + else + { + kernel(params.x, + params.y, + params.estimatedMean, + params.estimatedVariance, + params.bnScale, + params.bnBias, + params.epsilon, + c_, + h_ * w_, + n_, + h_ * w_, // cStride + 1, // hwStride + in_nstride_); // batchStride + } + }; + }; + + return result; +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/batchnorm/forward_spatial.cpp b/src/solver/batchnorm/forward_spatial.cpp new file mode 100644 index 0000000000..49872a3637 --- /dev/null +++ b/src/solver/batchnorm/forward_spatial.cpp @@ -0,0 +1,500 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2021 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace miopen { + +namespace solver { + +namespace batchnorm { + +// Spatial multiple needs space for 2 fp32 elements +// per each x thread (including the last workgroup) +// to stash intermediate mean and variance +const unsigned int stash_values_fwd = 2; + +bool PerformanceConfigBnFwdTraining::IsValid( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem) const +{ + if(this->kernel_id.empty()) + { + return false; + } + + // if default config is variant 2, check if it can be applied + // (based on variant 2 restrictions) + size_t vectorsize; + int variant = 0; + GetVariantFromKernelId(this->kernel_id, variant, vectorsize); + if(variant == 2) + { + return IsSpatialMultipleApplicable(problem, vectorsize, stash_values_fwd); + } + return true; +} + +void PerformanceConfigBnFwdTraining::HeuristicInit( + const miopen::batchnorm::ProblemDescription& problem) +{ + // Define default configuration based on heuristics and + // add all other valid configurations for the given problem + if(UseMultiple(problem)) + { + DefaultConfigSpatialMultiple(problem, stash_values_fwd, this->valid_kernels); + DefaultConfigSpatialSingle(problem, this->valid_kernels); + } + else + { + DefaultConfigSpatialSingle(problem, this->valid_kernels); + DefaultConfigSpatialMultiple(problem, stash_values_fwd, this->valid_kernels); + } + + // Set index and kernel_id to default value + this->index = 0; + this->kernel_id = valid_kernels[0]; +} + +bool PerformanceConfigBnFwdTraining::SetNextValue( + const miopen::batchnorm::ProblemDescription& problem_desc) +{ + // In case the valid_kernel list is empty, we fill it with + // default value as first one and all other valid ones will follow + if(this->valid_kernels.empty()) + { + this->HeuristicInit(problem_desc); + return true; + } + // Get next valid 
configuration + if((this->index + 1) < valid_kernels.size()) + { + ++this->index; + this->kernel_id = this->valid_kernels[index]; + return true; + } + else + { + return false; + } +} + +bool PerformanceConfigBnFwdTraining::operator==(const PerformanceConfigBnFwdTraining& other) const +{ + return this->kernel_id == other.kernel_id; +} + +bool PerformanceConfigBnFwdTraining::IsValidValue() const +{ + return this->index >= 0 && this->index < valid_kernels.size(); +} + +bool BnFwdTrainingSpatial::IsApplicable( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const +{ + if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || + bn_problem.GetMode() != miopenBNSpatial) + return false; + + if(!bn_problem.Is2D()) + return false; + + if(!IsOCLFwdTrainTypeValid(bn_problem)) + return false; + + return true; +} + +PerformanceConfigBnFwdTraining BnFwdTrainingSpatial::GetDefaultPerformanceConfig( + const ExecutionContext&, const miopen::batchnorm::ProblemDescription& problem_desc) const +{ + PerformanceConfigBnFwdTraining pp; + pp.HeuristicInit(problem_desc); + MIOPEN_LOG_I(pp.ToString()); + return pp; +} + +bool BnFwdTrainingSpatial::IsValidPerformanceConfig( + const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem_desc, + const PerformanceConfigBnFwdTraining& config) const +{ + bool valid = config.IsValid(ctx, problem_desc); + return valid; +} + +PerformanceConfigBnFwdTraining +BnFwdTrainingSpatial::Search(const ExecutionContext& ctx, + const miopen::batchnorm::ProblemDescription& problem, + const AnyInvokeParams& invoke_ctx) const +{ + return GenericSearch(*this, ctx, problem, invoke_ctx); +} + +ConvSolution BnFwdTrainingSpatial::GetSolution(const ExecutionContext& context, + const miopen::batchnorm::ProblemDescription& problem, + const PerformanceConfigBnFwdTraining& config) const +{ + const auto& handle = context.GetStream(); + + bool bfpmixparm = false; + bool bbfpmixparam = false; + 
bool bfp16parm = false; + bool bfp32parm = true; + + if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) + { + bfp16parm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenHalf && + problem.GetBnScale().GetType() == miopenFloat) + { + bfpmixparm = true; + bfp32parm = false; + } + else if(problem.GetXDesc().GetType() == miopenBFloat16 && + problem.GetBnScale().GetType() == miopenFloat) + { + bbfpmixparam = true; + bfp32parm = false; + } + + int n, c, h, w; + std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); + + unsigned int in_cstride = h * w; + unsigned int in_nstride = c * in_cstride; + unsigned int in_nhw = n * in_cstride; + unsigned int in_nchw = n * in_nstride; + auto inhw = float(1.0 / in_nhw); + + int variant = -1; + size_t vectorsize = 1; + GetVariantFromKernelId(config.kernel_id, variant, vectorsize); + + size_t xlocalsize, xgridsize; + size_t ylocalsize = 1, ygridsize = 1, zlocalsize = 1, zgridsize = 1; + unsigned int ldsgcn, ldsnogcn; + int stash_method = -1; + + if(variant != 2) + { + xlocalsize = 1024; + if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) + { + xlocalsize = 256; + } + xgridsize = c * xlocalsize; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; +#if(WORKAROUND_SWDEV_253606 == 0) + if(variant == 4) + { + xlocalsize = 256; + xgridsize = c * xlocalsize; + ylocalsize = 1; + ygridsize = 1; + ldsgcn = xlocalsize / 64; + ldsnogcn = xlocalsize; + } +#endif + } + else + { + GetSpatialMultipleConfig(problem, + stash_values_fwd, + vectorsize, + xlocalsize, + ylocalsize, + xgridsize, + ygridsize, + stash_method); + ldsnogcn = xlocalsize * ylocalsize; + ldsgcn = xlocalsize * ylocalsize / 64; + } + + auto result = ConvSolution{miopenStatusSuccess}; + + { + auto kernel = KernelInfo{}; + + auto build_params = KernelBuildParameters{ + {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, + {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, + 
{"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, + {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, + {"MIO_SAVE_MEAN_VARIANCE", static_cast(problem.GetResultSave())}, + {"MIO_RUNNING_RESULT", static_cast(problem.GetResultRunning())}, + {"MIO_BN_VARIANT", variant}, + {"MIO_BN_LDS_SIZE", ldsnogcn}, + {"MIO_BN_LDSGCN_SIZE", std::to_string(ldsgcn)}, + {"MIO_BN_N", n}, + {"MIO_BN_NGRPS", ygridsize / ylocalsize}, + {"MIO_BN_GRP0", xlocalsize}, + {"MIO_BN_GRP1", ylocalsize}, + {"MIO_BN_GRP2", zlocalsize}, + {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, + {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, + {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, + {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, + {"MIO_BN_VECTORIZE", static_cast(vectorsize > 1)}, + {"MIO_BN_STASH_METHOD", stash_method}, + }; + + if(variant != 4) + { + build_params.Define("MIO_BN_C", c); + build_params.Define("MIO_BN_HW", in_cstride); + build_params.Define("MIO_BN_NHW", in_nhw); + build_params.Define("MIO_BN_CHW", in_nstride); + build_params.Define("MIO_BN_NCHW", in_nchw); + } + + kernel.kernel_file = "MIOpenBatchNormFwdTrainSpatial.cl"; + kernel.kernel_name = "MIOpenBatchNormFwdTrainSpatial"; + + kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); + + kernel.l_wk.push_back(xlocalsize); + kernel.l_wk.push_back(ylocalsize); + kernel.l_wk.push_back(zlocalsize); + + kernel.g_wk.push_back(xgridsize); + kernel.g_wk.push_back(ygridsize); + kernel.g_wk.push_back(zgridsize); + + if(variant != 2) + { + result.construction_params.push_back(kernel); + } + else + { + auto copy = kernel; + copy.kernel_name = kernel.kernel_name + "MeanVariance"; + result.construction_params.push_back(copy); + + copy.kernel_name = kernel.kernel_name + "FinalMeanVariance"; + copy.g_wk[1] = kernel.l_wk[1]; + result.construction_params.push_back(copy); + + copy.kernel_name = kernel.kernel_name + "Norm"; + copy.g_wk[1] 
= kernel.g_wk[1]; + result.construction_params.push_back(copy); + } + } + + const auto dtype = problem.GetBnScale().GetType(); + const auto vn4 = (variant != 4); + + result.invoker_factory = [=](const std::vector& kernels) { + return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { + decltype(auto) params = raw_params.CastTo(); + const auto resultsave = + params.resultSaveMean != nullptr && params.resultSaveInvVariance != nullptr; + const auto resultrunning = + params.resultRunningMean != nullptr && params.resultRunningVariance != nullptr; + + float ctime = 0.; + visit_float(dtype, [&](auto as_float) { + if(variant != 2) + { + decltype(auto) kernel = handle_.Run(kernels.front()); + if(resultsave && resultrunning) + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance, + in_cstride, + in_nstride); + } + } + else if(resultsave) + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance, + in_cstride, + in_nstride); + } + } + else if(resultrunning) + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + 
params.bnBias, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + in_cstride, + in_nstride); + } + } + else + { + if(vn4) + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon); + } + else + { + kernel(params.x, + params.y, + params.bnScale, + params.bnBias, + as_float(inhw), + params.epsilon, + in_cstride, + in_nstride); + } + } + } + else + { + handle_.Run(kernels[0])(params.x, params.y); + profileSequence(handle_, 0, &ctime); + + if(resultsave && resultrunning) + { + handle_.Run(kernels[1])(params.y, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else if(resultsave) + { + handle_.Run(kernels[1])(params.y, + as_float(inhw), + params.epsilon, + params.resultSaveMean, + params.resultSaveInvVariance); + } + else if(resultrunning) + { + handle_.Run(kernels[1])(params.y, + as_float(inhw), + params.expAvgFactor, + params.resultRunningMean, + params.resultRunningVariance, + params.epsilon); + } + else + { + handle_.Run(kernels[1])(params.y, as_float(inhw), params.epsilon); + } + + profileSequence(handle_, 1, &ctime); + + handle_.Run(kernels[2])(params.x, params.y, params.bnScale, params.bnBias); + profileSequence(handle_, 2, &ctime); + } + }); + }; + }; + + return result; +} + +} // namespace batchnorm + +} // namespace solver + +} // namespace miopen diff --git a/src/solver/batchnorm/forward_spatial_multiple.cpp b/src/solver/batchnorm/forward_spatial_multiple.cpp deleted file mode 100644 index 6a2c42743b..0000000000 --- a/src/solver/batchnorm/forward_spatial_multiple.cpp +++ /dev/null @@ -1,281 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - *******************************************************************************/ - -#include - -#include -#include -#include -#include -#include - -#define WORKAROUND_SWDEV_253606 1 - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnFwdTrainingSpatialMultiple::IsApplicable( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - if(problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || - problem.GetMode() != miopenBNSpatial) - return false; - - if(!IsOCLFwdTrainTypeValid(problem)) - return false; - - return !BnFwdTrainingSpatialSingle{}.IsApplicable(context, problem); -} - -ConvSolution BnFwdTrainingSpatialMultiple::GetSolution( - const ExecutionContext& context, const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - const auto& xDesc = problem.GetXDesc(); - const auto& bnScaleBiasMeanVarDesc = problem.GetBnScale(); - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(xDesc.GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1024; - if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) - xlocalsize = 256; - - size_t ylocalsize = 1; - - size_t xgridsize = c * xlocalsize; - size_t ygridsize = 1; - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - if(xDesc.GetType() == miopenHalf && bnScaleBiasMeanVarDesc.GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(xDesc.GetType() == miopenHalf && bnScaleBiasMeanVarDesc.GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - 
bfp32parm = false; - } - - int variant = 1; - unsigned int ldsgcn = xlocalsize / 64; - unsigned int ldsnogcn = xlocalsize; - - if(!problem.IsLayoutNHWC()) - { -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) - { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else -#endif - - // clang-format off - if((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm)) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - } - - auto result = ConvSolution{miopenStatusSuccess}; - - { - auto kernel = KernelInfo{}; - - kernel.kernel_name = "MIOpenBatchNormFwdTrainSpatial"; - kernel.kernel_file = "MIOpenBatchNormFwdTrainSpatial.cl"; - - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_SAVE_MEAN_VARIANCE", static_cast(problem.GetResultSave())}, - {"MIO_RUNNING_RESULT", static_cast(problem.GetResultRunning())}, - {"MIO_BN_N", n}, - {"MIO_BN_C", c}, - {"MIO_BN_HW", in_cstride}, - {"MIO_BN_NHW", in_nhw}, - {"MIO_BN_CHW", in_nstride}, - {"MIO_BN_NCHW", in_nchw}, - {"MIO_BN_NGRPS", 
int(std::ceil(float(ygridsize) / ylocalsize))}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", ldsgcn}, - {"MIO_BN_VARIANT", variant}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? "1" : "0")}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - auto copy = kernel; - copy.kernel_name = kernel.kernel_name + "MeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "FinalMeanVariance"; - result.construction_params.push_back(copy); - - copy.kernel_name = kernel.kernel_name + "Norm"; - result.construction_params.push_back(copy); - } - - const auto dtype = bnScaleBiasMeanVarDesc.GetType(); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - const auto resultsave = - params.resultSaveMean != nullptr && params.resultSaveInvVariance != nullptr; - const auto resultrunning = - params.resultRunningMean != nullptr && params.resultRunningVariance != nullptr; - - float ctime = 0.; - visit_float(dtype, [&](auto as_float) { - handle_.Run(kernels[0])(params.x, params.y); - profileSequence(handle_, 0, &ctime); - - if(resultsave && resultrunning) - { - handle_.Run(kernels[1])(params.y, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - 
params.resultRunningVariance, - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance); - } - else if(resultsave) - { - handle_.Run(kernels[1])(params.y, - as_float(inhw), - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance); - } - else if(resultrunning) - { - handle_.Run(kernels[1])(params.y, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon); - } - else - { - handle_.Run(kernels[1])(params.y, as_float(inhw), params.epsilon); - } - - profileSequence(handle_, 1, &ctime); - - handle_.Run(kernels[2])(params.x, params.y, params.bnScale, params.bnBias); - profileSequence(handle_, 2, &ctime); - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git a/src/solver/batchnorm/forward_spatial_single.cpp b/src/solver/batchnorm/forward_spatial_single.cpp deleted file mode 100644 index ccfebce987..0000000000 --- a/src/solver/batchnorm/forward_spatial_single.cpp +++ /dev/null @@ -1,385 +0,0 @@ -/******************************************************************************* - * - * MIT License - * - * Copyright (c) 2021 Advanced Micro Devices, Inc. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - *******************************************************************************/ - -#include - -#include -#include -#include -#include - -#define WORKAROUND_SWDEV_253606 1 - -namespace miopen { - -namespace solver { - -namespace batchnorm { - -bool BnFwdTrainingSpatialSingle::IsApplicable( - const ExecutionContext&, const miopen::batchnorm::ProblemDescription& bn_problem) const -{ - if(bn_problem.GetDirection() != miopen::batchnorm::Direction::ForwardTraining || - bn_problem.GetMode() != miopenBNSpatial) - return false; - - if(!IsOCLFwdTrainTypeValid(bn_problem)) - return false; - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(bn_problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nhw = n * in_cstride; - - bool bfpmixparm = false; - bool bfp32parm = true; - - if(bn_problem.GetXDesc().GetType() == miopenHalf && - bn_problem.GetBnScale().GetType() == miopenHalf) - { - bfp32parm = false; - } - else if((bn_problem.GetXDesc().GetType() == miopenHalf || - bn_problem.GetXDesc().GetType() == miopenBFloat16) && - bn_problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - - // clang-format off - if(!(WORKAROUND_SWDEV_253606 == 0 && n < 3) && - !((in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && bfpmixparm) || - ((in_cstride > 512) && bfpmixparm) || - in_cstride <= 512)) - return false; - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - return 
false; - } - - return true; -} - -ConvSolution -BnFwdTrainingSpatialSingle::GetSolution(const ExecutionContext& context, - const miopen::batchnorm::ProblemDescription& problem) const -{ - const auto& handle = context.GetStream(); - - bool bfpmixparm = false; - bool bbfpmixparam = false; - bool bfp16parm = false; - bool bfp32parm = true; - - if(problem.GetXDesc().GetType() == miopenHalf && problem.GetBnScale().GetType() == miopenHalf) - { - bfp16parm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenHalf && - problem.GetBnScale().GetType() == miopenFloat) - { - bfpmixparm = true; - bfp32parm = false; - } - else if(problem.GetXDesc().GetType() == miopenBFloat16 && - problem.GetBnScale().GetType() == miopenFloat) - { - bbfpmixparam = true; - bfp32parm = false; - } - - int n, c, h, w; - std::tie(n, c, h, w) = tien<4>(problem.GetXDesc().GetLengths()); - - unsigned int in_cstride = h * w; - unsigned int in_nstride = c * in_cstride; - unsigned int in_nhw = n * in_cstride; - unsigned int in_nchw = n * in_nstride; - auto inhw = float(1.0 / in_nhw); - - size_t xlocalsize = 1024; - if(((in_cstride < 256) && (n < 256)) || ((in_cstride < 100) && (n <= 256))) - xlocalsize = 256; - - size_t ylocalsize = 1; - - size_t xgridsize = c * xlocalsize; - size_t ygridsize = 1; - - int variant = 1; - unsigned int ldsgcn = xlocalsize / 64; - unsigned int ldsnogcn = xlocalsize; - - if(!problem.IsLayoutNHWC()) - { -#if(WORKAROUND_SWDEV_253606 == 0) - if(n < 3) - { - variant = 4; - xlocalsize = 256; - xgridsize = c * xlocalsize; - ylocalsize = 1; - ygridsize = 1; - ldsgcn = xlocalsize / 64; - ldsnogcn = xlocalsize; - } - else -#endif - { - // clang-format off - if( (in_nhw < 33554432 && in_cstride > 1024) || - ((n >= 256) && (in_cstride > 60) && (bfpmixparm || bbfpmixparam)) || - ((in_cstride > 512) && (bfpmixparm || bbfpmixparam))) - { - variant = 1; - } - else if(in_cstride <= 512) - { - variant = 0; - } - else - { - variant = 2; - xlocalsize = 1; - ylocalsize 
= 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - // clang-format on - - if((n > 768) && (in_cstride > 150) && bfp32parm) - { - variant = 2; - xlocalsize = 1; - ylocalsize = 1024; - auto segment = int(std::ceil(double(in_cstride) / double(ylocalsize))); - xgridsize = c; - ygridsize = segment * ylocalsize; - ldsgcn = ylocalsize / 64; - ldsnogcn = ylocalsize; - } - } - } - - auto result = ConvSolution{miopenStatusSuccess}; - - { - size_t zlocalsize = 1; - size_t zgridsize = 1; - - auto kernel = KernelInfo{}; - - kernel.kernel_name = "MIOpenBatchNormFwdTrainSpatial"; - kernel.kernel_file = "MIOpenBatchNormFwdTrainSpatial.cl"; - - auto build_params = KernelBuildParameters{ - {"MIOPEN_USE_FP16", static_cast(bfp16parm)}, - {"MIOPEN_USE_FP32", static_cast(bfp32parm)}, - {"MIOPEN_USE_FPMIX", static_cast(bfpmixparm)}, - {"MIOPEN_USE_BFPMIX", static_cast(bbfpmixparam)}, - {"MIO_SAVE_MEAN_VARIANCE", static_cast(problem.GetResultSave())}, - {"MIO_RUNNING_RESULT", static_cast(problem.GetResultRunning())}, - {"MIO_BN_VARIANT", variant}, - {"MIO_BN_LDS_SIZE", ldsnogcn}, - {"MIO_BN_LDSGCN_SIZE", std::to_string(ldsgcn)}, - {"MIO_BN_N", n}, - {"MIO_BN_GRP0", xlocalsize}, - {"MIO_BN_GRP1", ylocalsize}, - {"MIO_BN_GRP2", zlocalsize}, - {"MIO_BN_GFX103X", (StartsWith(handle.GetDeviceName(), "gfx103") ? "1" : "0")}, - {"MIO_BN_GFX110X", (StartsWith(handle.GetDeviceName(), "gfx110") ? "1" : "0")}, - {"MIO_BN_GFX120X", (StartsWith(handle.GetDeviceName(), "gfx120") ? 
"1" : "0")}, - {"MIO_LAYOUT_NHWC", static_cast(problem.IsLayoutNHWC())}, - }; - - if(variant != 4) - { - build_params.Define("MIO_BN_C", c); - build_params.Define("MIO_BN_HW", in_cstride); - build_params.Define("MIO_BN_NHW", in_nhw); - build_params.Define("MIO_BN_CHW", in_nstride); - build_params.Define("MIO_BN_NCHW", in_nchw); - } - - kernel.comp_options = build_params.GenerateFor(kbp::OpenCL{}); - - kernel.l_wk.push_back(xlocalsize); - kernel.l_wk.push_back(ylocalsize); - kernel.l_wk.push_back(zlocalsize); - - kernel.g_wk.push_back(xgridsize); - kernel.g_wk.push_back(ygridsize); - kernel.g_wk.push_back(zgridsize); - - result.construction_params.push_back(kernel); - } - - const auto dtype = problem.GetBnScale().GetType(); - const auto vn4 = (variant != 4); - - result.invoker_factory = [=](const std::vector& kernels) { - return [=](const Handle& handle_, const AnyInvokeParams& raw_params) { - decltype(auto) kernel = handle_.Run(kernels.front()); - decltype(auto) params = raw_params.CastTo(); - const auto resultsave = - params.resultSaveMean != nullptr && params.resultSaveInvVariance != nullptr; - const auto resultrunning = - params.resultRunningMean != nullptr && params.resultRunningVariance != nullptr; - - visit_float(dtype, [&](auto as_float) { - if(resultsave && resultrunning) - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance, - in_cstride, - in_nstride); - } - } - else if(resultsave) - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon, - 
params.resultSaveMean, - params.resultSaveInvVariance); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon, - params.resultSaveMean, - params.resultSaveInvVariance, - in_cstride, - in_nstride); - } - } - else if(resultrunning) - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.expAvgFactor, - params.resultRunningMean, - params.resultRunningVariance, - params.epsilon, - in_cstride, - in_nstride); - } - } - else - { - if(vn4) - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon); - } - else - { - kernel(params.x, - params.y, - params.bnScale, - params.bnBias, - as_float(inhw), - params.epsilon, - in_cstride, - in_nstride); - } - } - }); - }; - }; - - return result; -} - -} // namespace batchnorm - -} // namespace solver - -} // namespace miopen diff --git a/test/fusionHost.hpp b/test/fusionHost.hpp index 6fbc428b1a..1e64290c19 100644 --- a/test/fusionHost.hpp +++ b/test/fusionHost.hpp @@ -162,7 +162,6 @@ void batchNormSpatialHostInference(const tensor& input, output(bidx, cidx, row, column) = static_cast(scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); // printf("output: %f\n",scale(0, cidx, 0, 0) * inhat + bias(0, cidx, 0, 0)); - // std::cout << output(bidx, cidx, row, column) << ","; } } } @@ -542,32 +541,44 @@ void batchNormPerActHostFwdTrain(const tensor& input, scale(0, cidx, row, column) * inhat + bias(0, cidx, row, column)); } // end for(n_batch) - newRunMean = runMean(0, cidx, row, column) * (1.0 - expAvgFactor); - runMean(0, cidx, row, column) = - mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp - + if(!runMean.data.empty()) + { + newRunMean = runMean(0, cidx, row, 
column) * (1.0 - expAvgFactor); + runMean(0, cidx, row, column) = + mean_accum * expAvgFactor + newRunMean; // newMean*factor + tmp + } // var(n+1) = p * var(n-1) + (1 - p)*(b/b-1)*var(n) - adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; - runVar(0, cidx, row, column) = - (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; - - saveMean(0, cidx, row, column) = static_cast(mean_accum); - saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + if(!runVar.data.empty()) + { + adjust = (n_batch == 1) ? variance_accum : (n / (n - 1.0)) * variance_accum; + runVar(0, cidx, row, column) = + (1 - expAvgFactor) * runVar(0, cidx, row, column) + expAvgFactor * adjust; + } + if(!saveMean.data.empty() || !saveInvVar.data.empty()) + { + saveMean(0, cidx, row, column) = static_cast(mean_accum); + saveInvVar(0, cidx, row, column) = static_cast(elemInvVar); + } } // for (column) } // for (row) }); } -template -void batchNormPerActHostBwdTrain(const tensor& x_input, - const tensor& dy_input, - const tensor& scale, - tensor& dscale, - tensor& dbias, - tensor& dx_out, - const tensor& savedMean, - const tensor& savedInvVar) +template +void batchNormPerActHostBwdTrain(const tensor& x_input, + const tensor& dy_input, + tensor& dx_out, + const tensor& scale, + tensor& dscale, + tensor& dbias, + const tensor& savedMean, + const tensor& savedInvVar) { int height, width, n_batch, channels; @@ -621,7 +632,7 @@ void batchNormPerActHostBwdTrain(const tensor& x_input, n_batch * scale(0, cidx, row, column) * dy_input(bidx, cidx, row, column) - tmp1; double tmp3 = elemInvVar / (double(n)); - dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); + dx_out(bidx, cidx, row, column) = static_cast(tmp3 * tmp2); } // end for(n_batchs) } // for (column) } // for (row) diff --git a/test/gtest/CMakeLists.txt b/test/gtest/CMakeLists.txt index 544e0526cc..648048c4ee 100644 --- a/test/gtest/CMakeLists.txt +++ b/test/gtest/CMakeLists.txt @@ -105,7 
+105,12 @@ function(add_gtest TEST_NAME TEST_CPP) # Enable CMake to discover the test binary # Note: Due to the following cmake issue with gtest_discover_tests https://gitlab.kitware.com/cmake/cmake/-/issues/17812 you cannot pass PROPERTIES as a list. # To work around this limitation, we are passing the environment variables in the format ENVIRONMENT;value1=${value1};ENVIRONMENT;value2=${value2}. - gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES ${TEST_ENVIRONMENT_VARIABLES} TEST_FILTER "${MIOPEN_GTEST_FILTER}") + # bn tests are run sequentially since running tests in parallel was causing large tensor case fail with insufficient memory error. + if("${TEST_NAME}" STREQUAL "test_bn_bwd_serial_run" OR "${TEST_NAME}" STREQUAL "test_bn_fwd_train_serial_run" OR "${TEST_NAME}" STREQUAL "test_bn_infer_serial_run") + gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES RUN_SERIAL TRUE ${TEST_ENVIRONMENT_VARIABLES} TEST_FILTER "${MIOPEN_GTEST_FILTER}") + else() + gtest_discover_tests(${TEST_NAME} DISCOVERY_TIMEOUT 300 DISCOVERY_MODE PRE_TEST WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/${DATABASE_INSTALL_DIR} PROPERTIES ${TEST_ENVIRONMENT_VARIABLES} TEST_FILTER "${MIOPEN_GTEST_FILTER}") + endif() endif() target_link_libraries(${TEST_NAME} BZip2::BZip2) if(WIN32) diff --git a/test/gtest/bn.hpp b/test/gtest/bn.hpp index 6734d0f99d..d89d950c13 100644 --- a/test/gtest/bn.hpp +++ b/test/gtest/bn.hpp @@ -62,12 +62,23 @@ static std::string ApiVerisonToString(int api_version) } } +static std::string BNModeToString(int bn_mode) +{ + switch(bn_mode) + { + case miopenBNPerActivation: return "BNPerActivation"; + case miopenBNSpatial: return "BNSpatial"; + default: return "UnknownBNMode"; + } +} + // Custom test name generator to handle enums template struct TestNameGenerator { - 
std::string operator()( - const testing::TestParamInfo>& info) + std::string + operator()(const testing::TestParamInfo< + std::tuple>& info) const { constexpr int dimension = std::is_same::value ? 2 @@ -75,15 +86,17 @@ struct TestNameGenerator : -1; static_assert(dimension > 0); - const auto& layout_type = std::get<1>(info.param); - const auto& api_type = std::get<2>(info.param); + const auto& layout_type = std::get<1>(info.param); + const auto& batchnorm_mode = std::get<2>(info.param); + const auto& api_type = std::get<3>(info.param); - std::string tensor_name = LayoutToString(layout_type); - std::string api_name = ApiVerisonToString(api_type); + std::string tensor_name = LayoutToString(layout_type); + std::string bn_mode_name = BNModeToString(batchnorm_mode); + std::string api_name = ApiVerisonToString(api_type); std::ostringstream oss; - oss << tensor_name + "_" + api_name + "_Dim_" + std::to_string(dimension) + "_test_id_" + - std::to_string(info.index); + oss << tensor_name + "_" + bn_mode_name + "_" + api_name + "_Dim_" + + std::to_string(dimension) + "_test_id_" + std::to_string(info.index); return oss.str(); } }; @@ -93,15 +106,17 @@ template struct BNInferTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout, api_type) = this->GetParam(); - bn_infer_test_data.SetUpImpl(bn_config, tensor_layout); + std::tie(bn_config, tensor_layout, bn_mode, api_type) = this->GetParam(); + bn_infer_test_data.SetUpImpl(bn_config, bn_mode, tensor_layout); auto&& handle = get_handle(); if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) @@ -113,7 +128,7 @@ struct BNInferTest if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationForwardInference(&handle, - bn_config.mode, + bn_mode, &bn_infer_test_data.alpha, &bn_infer_test_data.beta, &bn_infer_test_data.input.desc, @@ -131,7 +146,7 @@ struct BNInferTest { res = 
miopenBatchNormalizationForwardInference_V2( &handle, - bn_config.mode, + bn_mode, &bn_infer_test_data.alpha, &bn_infer_test_data.beta, &bn_infer_test_data.input.desc, @@ -172,33 +187,40 @@ struct BNInferTest bn_infer_test_data.out_dev, bn_infer_test_data.output.data.size()); test::ComputeCPUBNInference(bn_infer_test_data); // 4e-3 is tolerance used by CK kernel. - test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.ref_out, 4e-3); + test::CompareTensor(bn_infer_test_data.output, bn_infer_test_data.out_ref, 4e-3); } TestCase bn_config; bool test_skipped = false; - BNInferTestData + BNInferTestData bn_infer_test_data; miopenTensorLayout_t tensor_layout; + miopenBatchNormMode_t bn_mode; BNApiType api_type; }; template -struct BNBwdTest - : public ::testing::TestWithParam> +struct BNBwdTest : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout, api_type) = this->GetParam(); - bn_bwd_test_data.SetUpImpl(bn_config, tensor_layout); + std::tie(bn_config, tensor_layout, bn_mode, api_type) = this->GetParam(); + bn_bwd_test_data.SetUpImpl(bn_config, bn_mode, tensor_layout); auto&& handle = get_handle(); if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) @@ -210,7 +232,7 @@ struct BNBwdTest if(api_type == BNApiType::testBNAPIV1) { res = miopenBatchNormalizationBackward(&handle, - bn_config.mode, + bn_mode, &bn_bwd_test_data.alphaDataDiff, &bn_bwd_test_data.betaDataDiff, &bn_bwd_test_data.alphaParamDiff, @@ -232,7 +254,7 @@ struct BNBwdTest else if(api_type == BNApiType::testBNAPIV2) { res = miopenBatchNormalizationBackward_V2(&handle, - bn_config.mode, + bn_mode, &bn_bwd_test_data.alphaDataDiff, &bn_bwd_test_data.betaDataDiff, &bn_bwd_test_data.alphaParamDiff, @@ -283,10 +305,11 @@ struct BNBwdTest test::ComputeCPUBNBwd(bn_bwd_test_data); - test::CompareTensor(bn_bwd_test_data.output, bn_bwd_test_data.ref_out, bwd_tol); - test::CompareTensor( + test::CompareTensor( + 
bn_bwd_test_data.output, bn_bwd_test_data.out_ref, bwd_tol); + test::CompareTensor( bn_bwd_test_data.dScale, bn_bwd_test_data.dScale_ref, bwd_tol); - test::CompareTensor( + test::CompareTensor( bn_bwd_test_data.dBias, bn_bwd_test_data.dBias_ref, bwd_tol); } @@ -295,13 +318,14 @@ struct BNBwdTest BNBwdTestData bn_bwd_test_data; miopenTensorLayout_t tensor_layout; + miopenBatchNormMode_t bn_mode; BNApiType api_type; double bwd_tol = 4e-3; }; @@ -310,16 +334,18 @@ template struct BNFwdTrainTest - : public ::testing::TestWithParam> + : public ::testing::TestWithParam< + std::tuple> { protected: void SetUp() override { - std::tie(bn_config, tensor_layout, api_type) = this->GetParam(); - bn_fwd_train_test_data.SetUpImpl(bn_config, tensor_layout); + std::tie(bn_config, tensor_layout, bn_mode, api_type) = this->GetParam(); + bn_fwd_train_test_data.SetUpImpl(bn_config, bn_mode, tensor_layout); auto&& handle = get_handle(); if(!miopen::solver::ck_utility::is_ck_whitelist(handle.GetStream())) @@ -332,7 +358,7 @@ struct BNFwdTrainTest { res = miopenBatchNormalizationForwardTraining( &handle, - bn_config.mode, + bn_mode, &bn_fwd_train_test_data.alpha, &bn_fwd_train_test_data.beta, &bn_fwd_train_test_data.input.desc, @@ -353,7 +379,7 @@ struct BNFwdTrainTest { res = miopenBatchNormalizationForwardTraining_V2( &handle, - bn_config.mode, + bn_mode, &bn_fwd_train_test_data.alpha, &bn_fwd_train_test_data.beta, &bn_fwd_train_test_data.input.desc, @@ -402,35 +428,42 @@ struct BNFwdTrainTest bn_fwd_train_test_data.output.data = handle.Read( bn_fwd_train_test_data.out_dev, bn_fwd_train_test_data.output.data.size()); - bn_fwd_train_test_data.saveMean.data = handle.Read( + bn_fwd_train_test_data.saveMean.data = handle.Read( bn_fwd_train_test_data.saveMean_dev, bn_fwd_train_test_data.saveMean.data.size()); bn_fwd_train_test_data.saveVariance.data = - handle.Read(bn_fwd_train_test_data.saveVariance_dev, - bn_fwd_train_test_data.saveVariance_ref.data.size()); - 
bn_fwd_train_test_data.runMean.data = handle.Read( + handle.Read(bn_fwd_train_test_data.saveVariance_dev, + bn_fwd_train_test_data.saveVariance_ref.data.size()); + bn_fwd_train_test_data.runMean.data = handle.Read( bn_fwd_train_test_data.runMean_dev, bn_fwd_train_test_data.runMean_ref.data.size()); bn_fwd_train_test_data.runVariance.data = - handle.Read(bn_fwd_train_test_data.runVariance_dev, - bn_fwd_train_test_data.runVariance_ref.data.size()); + handle.Read(bn_fwd_train_test_data.runVariance_dev, + bn_fwd_train_test_data.runVariance_ref.data.size()); test::ComputeCPUBNFwdTrain(bn_fwd_train_test_data); // 4e-3 is tolerance used by CK kernel. test::CompareTensor( - bn_fwd_train_test_data.output, bn_fwd_train_test_data.ref_out, 4e-3); - test::CompareTensor( + bn_fwd_train_test_data.output, bn_fwd_train_test_data.out_ref, 4e-3); + test::CompareTensor( bn_fwd_train_test_data.saveMean, bn_fwd_train_test_data.saveMean_ref, 4e-3); - test::CompareTensor( + test::CompareTensor( bn_fwd_train_test_data.saveVariance, bn_fwd_train_test_data.saveVariance_ref, 4e-3); - test::CompareTensor( + test::CompareTensor( bn_fwd_train_test_data.runMean, bn_fwd_train_test_data.runMean_ref, 4e-3); - test::CompareTensor( + test::CompareTensor( bn_fwd_train_test_data.runVariance, bn_fwd_train_test_data.runVariance_ref, 4e-3); } TestCase bn_config; bool test_skipped = false; - BNFwdTrainTestData + BNFwdTrainTestData bn_fwd_train_test_data; miopenTensorLayout_t tensor_layout; + miopenBatchNormMode_t bn_mode; BNApiType api_type; }; diff --git a/test/gtest/bn_bwd.cpp b/test/gtest/bn_bwd.cpp index 5e8c3856f5..0854ec2134 100644 --- a/test/gtest/bn_bwd.cpp +++ b/test/gtest/bn_bwd.cpp @@ -28,13 +28,19 @@ /* typename XDataType, typename DxDataType, typename DyDataType, - typename AccDataType, typename ScaleDataType, typename DscaleDbiasDataType, - typename MeanVarDataType> */ - -struct GPU_BNBWDSmall_FP32 - : BNBwdTest + typename MeanVarDataType, + typename AccDataType> */ + +struct 
GPU_BNBWDSmall_FP32 : BNBwdTest { }; @@ -44,7 +50,7 @@ struct GPU_BNOCLBWDLarge2D_FP16 : BNBwdTest { }; @@ -55,38 +61,38 @@ struct GPU_BNOCLBWDLarge3D_FP16 : BNBwdTest { }; struct GPU_BNCKBWDLarge2D_BFP16 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNOCLBWDLarge2D_BFP16 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNOCLBWDLarge3D_BFP16 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNBWDSmall2D_FP32 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNBWDLarge2D_FP32 - : BNBwdTest + : BNBwdTest { }; struct GPU_BNBWDLarge3D_FP32 - : BNBwdTest + : BNBwdTest { }; @@ -124,20 +130,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDSmall_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge3D_FP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -146,20 +158,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKBWDLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), 
+ testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLBWDLarge3D_BFP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); @@ -167,20 +185,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDSmall2D_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDLarge2D_FP32, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDLarge3D_FP32, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // fp64 @@ -188,6 +212,8 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDSmall2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); @@ -195,5 +221,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNBWDLarge2D_FP64, 
testing::Combine(testing::ValuesIn(Network2DLarge()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_bwd_serial_run.cpp b/test/gtest/bn_bwd_serial_run.cpp new file mode 100644 index 0000000000..66f1d56d26 --- /dev/null +++ b/test/gtest/bn_bwd_serial_run.cpp @@ -0,0 +1,111 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "bn.hpp" +/* typename XDataType, + typename DxDataType, + typename DyDataType, + typename ScaleDataType, + typename DscaleDbiasDataType, + typename MeanVarDataType, + typename AccDataType> */ + +struct GPU_BNOCLBWDSerialRun3D_FP16 : BNBwdTest +{ +}; + +struct GPU_BNOCLBWDSerialRun3D_BFP16 + : BNBwdTest +{ +}; + +struct GPU_BNBWDSerialRun3D_FP32 + : BNBwdTest +{ +}; + +struct GPU_BNBWDSerialRun3D_FP64 + : BNBwdTest +{ +}; + +// fp16 +TEST_P(GPU_BNOCLBWDSerialRun3D_FP16, BnV2SerialRunBWDOCL3D_fp16) {} + +// bfp16 +TEST_P(GPU_BNOCLBWDSerialRun3D_BFP16, BnV2SerialRunBWDOCLbfp16_3D) {} + +// fp32 (float) +TEST_P(GPU_BNBWDSerialRun3D_FP32, BnV2SerialRunBWDCKfp32_3D) {} + +// fp64 +TEST_P(GPU_BNBWDSerialRun3D_FP64, DISABLED_BnV2SerialRunBWDCKfp64_3D) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLBWDSerialRun3D_FP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); + +// bfp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLBWDSerialRun3D_BFP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNBWDSerialRun3D_FP32, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); +// fp64 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNBWDSerialRun3D_FP64, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCHW, 
miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); diff --git a/test/gtest/bn_fwd_train.cpp b/test/gtest/bn_fwd_train.cpp index a2c2f6da72..2483c0905f 100644 --- a/test/gtest/bn_fwd_train.cpp +++ b/test/gtest/bn_fwd_train.cpp @@ -26,76 +26,75 @@ #include "bn.hpp" -// XDataType : half -// YDataYype : half -// ScaleDataType : half -// BiasDataType : half -// MeanVarDataType : float +// XDataType +// YDataYype +// ScaleDataType +// BiasDataType +// RunSaveDataType +// AccDataType struct GPU_BNCKFWDTrainLarge2D_FP16 : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge2D_FP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge3D_FP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; -// XDataType : bfloat16 -// YDataYype : bfloat16 -// ScaleDataType : bfloat16 -// BiasDataType : bfloat16 -// MeanVarDataType : float - struct GPU_BNCKFWDTrainLarge2D_BFP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge2D_BFP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNOCLFWDTrainLarge3D_BFP16 - : BNFwdTrainTest + : BNFwdTrainTest { }; -struct GPU_BNFWDTrainSmall2D_FP32 : BNFwdTrainTest +struct GPU_BNFWDTrainSmall2D_FP32 + : BNFwdTrainTest { }; -struct GPU_BNFWDTrainSmall3D_FP32 : BNFwdTrainTest +struct GPU_BNFWDTrainSmall3D_FP32 + : BNFwdTrainTest { }; -struct GPU_BNFWDTrainLarge2D_FP32 : BNFwdTrainTest +struct GPU_BNFWDTrainLarge2D_FP32 + : BNFwdTrainTest { }; struct GPU_BNFWDTrainSmall2D_FP64 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNFWDTrainLarge2D_FP64 - : BNFwdTrainTest + : BNFwdTrainTest { }; struct GPU_BNFWDTrainSmall3D_FP64 - : BNFwdTrainTest + : BNFwdTrainTest { }; @@ -125,20 +124,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKFWDTrainLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), 
testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge3D_FP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -147,20 +152,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKFWDTrainLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLFWDTrainLarge3D_BFP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); // fp32 @@ -168,27 +179,36 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainSmall2D_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW}), + 
testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); -INSTANTIATE_TEST_SUITE_P(Smoke, - GPU_BNFWDTrainSmall3D_FP32, - testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), - testing::ValuesIn({testBNAPIV1, testBNAPIV2})), - TestNameGenerator()); - INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainLarge2D_FP32, testing::Combine(testing::ValuesIn(Network2DLarge()), testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); + +INSTANTIATE_TEST_SUITE_P(Smoke, + GPU_BNFWDTrainSmall3D_FP32, + testing::Combine(testing::ValuesIn(Network3DBN()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + // fp64 INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainSmall2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); @@ -196,12 +216,16 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainLarge2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNFWDTrainSmall3D_FP64, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_fwd_train_serial_run.cpp 
b/test/gtest/bn_fwd_train_serial_run.cpp new file mode 100644 index 0000000000..d4e254434c --- /dev/null +++ b/test/gtest/bn_fwd_train_serial_run.cpp @@ -0,0 +1,105 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "bn.hpp" + +// XDataType +// YDataYype +// ScaleDataType +// BiasDataType +// RunSaveDataType +// AccDataType + +struct GPU_BNOCLFWDTrainSerialRun3D_FP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BNOCLFWDTrainSerialRun3D_BFP16 + : BNFwdTrainTest +{ +}; + +struct GPU_BNFWDTrainSerialRun3D_FP32 + : BNFwdTrainTest +{ +}; + +struct GPU_BNFWDTrainSerialRun3D_FP64 + : BNFwdTrainTest +{ +}; + +// fp16 +TEST_P(GPU_BNOCLFWDTrainSerialRun3D_FP16, BnV2SerialRunFWD_TrainOCL_3D_fp16) {} + +// bfp16 +TEST_P(GPU_BNOCLFWDTrainSerialRun3D_BFP16, BnV2SerialRunFWD_TrainOCL_3Dbfp16) {} + +// fp32 (float) +TEST_P(GPU_BNFWDTrainSerialRun3D_FP32, BnV1SerialRunFWD_Train3Dfp32) {} + +// fp64 +TEST_P(GPU_BNFWDTrainSerialRun3D_FP64, DISABLED_BnV2SerialRunFWD_Train3Dfp64) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLFWDTrainSerialRun3D_FP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// bfp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLFWDTrainSerialRun3D_BFP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); +// fp32 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNFWDTrainSerialRun3D_FP32, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// fp64 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNFWDTrainSerialRun3D_FP64, + 
testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); diff --git a/test/gtest/bn_infer.cpp b/test/gtest/bn_infer.cpp index ad9edf694e..7ddbbd11d7 100644 --- a/test/gtest/bn_infer.cpp +++ b/test/gtest/bn_infer.cpp @@ -31,22 +31,24 @@ // ScaleDataType : half_float::half // BiasDataType : half_float::half // MeanVarDataType : float +// AccDataType : double struct GPU_BNCKInferLarge2D_FP16 : BNInferTest { }; struct GPU_BNOCLInferLarge2D_FP16 - : BNInferTest + : BNInferTest { }; struct GPU_BNOCLInferLarge3D_FP16 - : BNInferTest + : BNInferTest { }; @@ -56,7 +58,7 @@ struct GPU_BNOCLInferLarge3D_FP16 // BiasDataType : bfloat16 // MeanVarDataType : float struct GPU_BNCKInferLarge2D_BFP16 - : BNInferTest + : BNInferTest { }; @@ -66,30 +68,35 @@ struct GPU_BNCKInferLarge2D_BFP16 // BiasDataType : float // MeanVarDataType : float struct GPU_BNOCLInferLarge2D_BFP16 - : BNInferTest + : BNInferTest { }; struct GPU_BNOCLInferLarge3D_BFP16 - : BNInferTest + : BNInferTest { }; -struct GPU_BNInferSmall2D_FP32 : BNInferTest +struct GPU_BNInferSmall2D_FP32 + : BNInferTest { }; -struct GPU_BNInferSmall3D_FP32 : BNInferTest +struct GPU_BNInferSmall3D_FP32 + : BNInferTest { }; -struct GPU_BNInferLarge2D_FP32 : BNInferTest +struct GPU_BNInferLarge2D_FP32 + : BNInferTest { }; -struct GPU_BNInferSmall2D_FP64 : BNInferTest +struct GPU_BNInferSmall2D_FP64 + : BNInferTest { }; -struct GPU_BNInferLarge2D_FP64 : BNInferTest +struct GPU_BNInferLarge2D_FP64 + : BNInferTest { }; @@ -117,20 +124,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKInferLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); 
INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge2D_FP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge3D_FP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); // bfp16 @@ -138,20 +151,26 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNCKInferLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge2D_BFP16, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNOCLInferLarge3D_BFP16, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1, testBNAPIV2})), TestNameGenerator()); @@ -159,21 +178,27 @@ INSTANTIATE_TEST_SUITE_P(Smoke, INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferSmall2D_FP32, testing::Combine(testing::ValuesIn(Network2DLarge()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + 
testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferLarge2D_FP32, testing::Combine(testing::ValuesIn(Network2DSmall()), - testing::ValuesIn({miopenTensorNCHW}), + testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferSmall3D_FP32, testing::Combine(testing::ValuesIn(Network3DBN()), - testing::ValuesIn({miopenTensorNCDHW}), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); // fp64 @@ -181,6 +206,8 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferSmall2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV1})), TestNameGenerator()); @@ -188,5 +215,7 @@ INSTANTIATE_TEST_SUITE_P(Smoke, GPU_BNInferLarge2D_FP64, testing::Combine(testing::ValuesIn(Network2DSmall()), testing::ValuesIn({miopenTensorNCHW, miopenTensorNHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), testing::ValuesIn({testBNAPIV2})), TestNameGenerator()); diff --git a/test/gtest/bn_infer_serial_run.cpp b/test/gtest/bn_infer_serial_run.cpp new file mode 100644 index 0000000000..18277eb757 --- /dev/null +++ b/test/gtest/bn_infer_serial_run.cpp @@ -0,0 +1,96 @@ +/******************************************************************************* + * + * MIT License + * + * Copyright (c) 2025 Advanced Micro Devices, Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + *******************************************************************************/ + +#include "bn.hpp" + +// XDataType : half_float::half +// YDataYype : half_float::half +// ScaleDataType : half_float::half +// BiasDataType : half_float::half +// MeanVarDataType : float +// AccDataType : double +struct GPU_BNOCLInferSerialRun3D_FP16 + : BNInferTest +{ +}; + +// XDataType : bfloat16 +// YDataYype : bfloat16 +// ScaleDataType : float +// BiasDataType : float +// MeanVarDataType : float +struct GPU_BNOCLInferSerialRun3D_BFP16 + : BNInferTest +{ +}; + +struct GPU_BNInferSerialRun3D_FP32 + : BNInferTest +{ +}; + +struct GPU_BNInferSerialRun3D_FP64 + : BNInferTest +{ +}; + +// fp16 +TEST_P(GPU_BNOCLInferSerialRun3D_FP16, BnV2SerialRunInferOCLfp16_3D) {} + +// bfp16 +TEST_P(GPU_BNOCLInferSerialRun3D_BFP16, BnV2SerialRunInferOCLbfp16_3D) {} + +// fp32 (float) +TEST_P(GPU_BNInferSerialRun3D_FP32, DISABLED_BnV2SerialRunInferfp32_3D) {} + +// fp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLInferSerialRun3D_FP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); +// bfp16 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNOCLInferSerialRun3D_BFP16, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV1, testBNAPIV2})), + TestNameGenerator()); + +// fp32 +INSTANTIATE_TEST_SUITE_P(Full, + GPU_BNInferSerialRun3D_FP32, + testing::Combine(testing::ValuesIn(Network3DSerialCase()), + testing::ValuesIn({miopenTensorNCDHW, miopenTensorNDHWC}), + testing::ValuesIn({miopenBNSpatial, + miopenBNPerActivation}), + testing::ValuesIn({testBNAPIV2})), + TestNameGenerator()); diff --git a/test/gtest/bn_test_data.hpp 
b/test/gtest/bn_test_data.hpp index 2bbb87d937..163cd0c90f 100644 --- a/test/gtest/bn_test_data.hpp +++ b/test/gtest/bn_test_data.hpp @@ -41,7 +41,6 @@ struct BN2DTestCase size_t C; size_t H; size_t W; - miopenBatchNormMode_t mode; miopen::batchnorm::Direction Direction; bool save; bool keepRunning; @@ -49,8 +48,8 @@ struct BN2DTestCase friend std::ostream& operator<<(std::ostream& ss, const BN2DTestCase& tc) { return ss << "(N: " << tc.N << " C:" << tc.C << " H:" << tc.H << " W:" << tc.W - << " mode: " << tc.mode << " Direction: " << static_cast(tc.Direction) - << " save: " << tc.save << " keepRunning: " << tc.keepRunning; + << " Direction: " << static_cast(tc.Direction) << " save: " << tc.save + << " keepRunning: " << tc.keepRunning; } std::vector GetInput() const { return {N, C, H, W}; } }; @@ -62,7 +61,6 @@ struct BN3DTestCase size_t D; size_t H; size_t W; - miopenBatchNormMode_t mode; miopen::batchnorm::Direction Direction; bool save; bool keepRunning; @@ -70,9 +68,8 @@ struct BN3DTestCase friend std::ostream& operator<<(std::ostream& ss, const BN3DTestCase& tc) { return ss << "(N: " << tc.N << " C:" << tc.C << " D:" << tc.D << " H:" << tc.H - << " W:" << tc.W << " mode: " << tc.mode - << " Direction: " << static_cast(tc.Direction) << " save: " << tc.save - << " keepRunning: " << tc.keepRunning; + << " W:" << tc.W << " Direction: " << static_cast(tc.Direction) + << " save: " << tc.save << " keepRunning: " << tc.keepRunning; } std::vector GetInput() const { return {N, C, D, H, W}; } }; @@ -86,88 +83,114 @@ std::vector Network2DLarge(); template std::vector Network3DBN(); +template +std::vector Network3DSerialCase(); + template <> inline std::vector Network2DLarge() { // pyt_mlperf_resnet50v1.5 + // clang-format off return { - {192, 1, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 2048, 7, 7, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 2048, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 256, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 512, 7, 7, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 112, 112, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 112, 112, miopenBNSpatial, 
miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {64, 64, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::ForwardInference, 1, 0}, - {768, 1, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {768, 1, 23, 23, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {832, 1, 14, 14, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, - {832, 1, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 1}}; + {64, 1, 1024, 1024, miopen::batchnorm::Direction::Backward, 1, 0}, + {192, 1, 8, 8, miopen::batchnorm::Direction::Backward, 1, 0}, + {12, 40, 122, 122, miopen::batchnorm::Direction::Backward, 1, 0}, + {64, 256, 14, 14, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 14, 14, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 28, 28, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 28, 28, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 28, 28, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 256, 56, 56, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 256, 56, 56, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 256, 56, 56, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 14, 14, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 14, 14, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 28, 28, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 28, 28, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 28, 28, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 512, 7, 7, 
miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 512, 7, 7, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 512, 7, 7, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 112, 112, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 112, 112, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 112, 112, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 64, 56, 56, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 64, 56, 56, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {64, 64, 56, 56, miopen::batchnorm::Direction::ForwardInference, 1, 0}, + {64, 2048, 7, 7, miopen::batchnorm::Direction::Backward, 0, 1}, + {64, 2048, 17, 17, miopen::batchnorm::Direction::Backward, 0, 1}, + {128, 256, 14, 14, miopen::batchnorm::Direction::Backward, 0, 1}, + {128, 256, 16, 16, miopen::batchnorm::Direction::Backward, 0, 1}, + {670, 1, 224, 224, miopen::batchnorm::Direction::Backward, 0, 1}, + {768, 1, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {768, 1, 23, 23, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {832, 1, 14, 14, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {832, 1, 28, 28, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + // edge cases + {69328, 1, 22, 22, miopen::batchnorm::Direction::ForwardTraining, 1, 1}, + {69328, 1, 13, 79, miopen::batchnorm::Direction::ForwardTraining, 1, 1} + }; + // clang-format on +} + +// These are very large tensors which caused memory insufficient error +// when ran parallely by ctest. Hence, these are run serially. +// Shape: (2, 2048, 16, 128, 128) --> Size: 1.07e+09 +// For now any test case with tensor size greater then 1e09 need to be run serially. 
+template <> +inline std::vector Network3DSerialCase() +{ + return {{2, 2048, 16, 128, 128, miopen::batchnorm::Direction::Backward, 0, 1}}; } template <> inline std::vector Network2DSmall() { // pyt_mlperf_resnet50v1.5 + // clang-format off return { - {192, 2, 8, 8, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 56, 56, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 128, 256, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {64, 2048, 17, 17, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - + {12, 40, 122, 122, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, miopen::batchnorm::Direction::Backward, 1, 0}, + {192, 2, 8, 8, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 56, 56, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 128, 256, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, }; + // clang-format on } template <> inline std::vector Network3DBN() { + // clang-format off return { - {2, 2, 3, 224, 224, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 132, 28, 28, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 1, 0}, - {16, 8, 16, 128, 128, miopenBNSpatial, miopen::batchnorm::Direction::ForwardTraining, 1, 0}, - {2, 2048, 16, 128, 128, miopenBNSpatial, miopen::batchnorm::Direction::Backward, 0, 1}, - + {2, 2, 3, 224, 224, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 132, 28, 28, miopen::batchnorm::Direction::Backward, 1, 0}, + {16, 8, 16, 128, 128, miopen::batchnorm::Direction::ForwardTraining, 1, 0} }; + // clang-format on } -template +template struct BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { bn_config = config; tensor_layout = t_layout; + bn_mode = t_bnmode; CreateTensors(); InitTensorsWithRandValue(); SetDirection(); - SetBNMode(); 
WriteToGPU(); } const miopen::TensorDescriptor& GetInputDesc() const { return input.desc; } tensor input; tensor output; - tensor ref_out; + tensor out_ref; miopen::Allocator::ManageDataPtr in_dev; miopen::Allocator::ManageDataPtr out_dev; @@ -184,17 +207,16 @@ struct BNTestData { input = tensor{tensor_layout, bn_config.GetInput()}; output = tensor{tensor_layout, bn_config.GetInput()}; - ref_out = tensor{tensor_layout, bn_config.GetInput()}; + out_ref = tensor{tensor_layout, bn_config.GetInput()}; } void InitTensorsWithRandValue() { - input.generate( - [](auto...) { return prng::gen_descreet_uniform_sign(1e-2, 100); }); + // -2.0 to 2.0 + input.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); } void SetDirection() { direction = bn_config.Direction; } - void SetBNMode() { bn_mode = bn_config.mode; } void WriteToGPU() { auto&& handle = get_handle(); @@ -208,12 +230,15 @@ template -struct BNInferTestData : public BNTestData +struct BNInferTestData : public BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl( + config, t_bnmode, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -238,32 +263,35 @@ struct BNInferTestData : public BNTestData void CreateTensors() { auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - scale = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - shift = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - estMean = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + miopen::DeriveBNTensorDescriptor( + derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = 
tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + estMean = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; estVariance = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() { - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); - }; - scale.generate(gen_value); - shift.generate(gen_value); - estMean.generate(gen_value); - - auto gen_var = [](auto...) { - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - estVariance.generate(gen_var); + // -2.0 to 2.0 + scale.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + shift.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + estMean.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + // estVaraince has to be +ve number otherwise 1/sqrt(-ve) would + // give img number + estVariance.generate( + uniform_unsigned_initializer(2e-3 /*scale*/, 1000 /*range*/)); } void WriteToGPU() { @@ -278,16 +306,18 @@ struct BNInferTestData : public BNTestData template -struct BNBwdTestData : public BNTestData +struct BNBwdTestData : public BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl( + config, t_bnmode, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -301,8 +331,8 @@ struct BNBwdTestData : public BNTestData tensor dy; tensor dScale; tensor dBias; - tensor dScale_ref; - tensor dBias_ref; + tensor dScale_ref; + tensor dBias_ref; miopen::Allocator::ManageDataPtr bnScale_dev; miopen::Allocator::ManageDataPtr savedMean_dev; @@ -321,40 +351,46 @@ struct BNBwdTestData : public BNTestData private: void CreateTensors() { - dy = tensor{BNTestData::tensor_layout, - 
BNTestData::bn_config.GetInput()}; + dy = tensor{ + BNTestData::tensor_layout, + BNTestData::bn_config.GetInput()}; auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - bnScale = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + miopen::DeriveBNTensorDescriptor( + derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + bnScale = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; savedMean = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; savedInvVar = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; dScale = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; dBias = tensor{ - BNTestData::tensor_layout, derivedBnDesc.GetLengths()}; - dScale_ref = dScale; - dBias_ref = dBias; + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dScale_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + dBias_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() { - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); - }; - dy.generate(gen_value); - bnScale.generate(gen_value); - savedMean.generate(gen_value); - - auto gen_var = [](auto...) 
{ - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - savedInvVar.generate(gen_var); + dy.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + bnScale.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + savedMean.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + savedInvVar.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); std::fill(dScale.begin(), dScale.end(), 0.); std::fill(dBias.begin(), dBias.end(), 0.); @@ -380,13 +416,16 @@ template -struct BNFwdTrainTestData : public BNTestData +struct BNFwdTrainTestData : public BNTestData { - void SetUpImpl(const TConfig& config, miopenTensorLayout_t t_layout) + void + SetUpImpl(const TConfig& config, miopenBatchNormMode_t t_bnmode, miopenTensorLayout_t t_layout) { - BNTestData::SetUpImpl(config, t_layout); + BNTestData::SetUpImpl( + config, t_bnmode, t_layout); CreateTensors(); InitTensorsWithRandValue(); WriteToGPU(); @@ -394,10 +433,10 @@ struct BNFwdTrainTestData : public BNTestData tensor scale; tensor shift; - tensor saveMean; - tensor saveVariance; - tensor runMean; - tensor runVariance; + tensor saveMean; + tensor saveVariance; + tensor runMean; + tensor runVariance; tensor saveMean_ref; tensor saveVariance_ref; @@ -422,41 +461,69 @@ struct BNFwdTrainTestData : public BNTestData void CreateTensors() { auto derivedBnDesc = miopen::TensorDescriptor{}; - miopen::DeriveBNTensorDescriptor(derivedBnDesc, - BNTestData::input.desc, - BNTestData::bn_mode); - scale = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - shift = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - saveMean = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - saveVariance = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - runMean = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; - runVariance = tensor{BNTestData::tensor_layout, - derivedBnDesc.GetLengths()}; + 
miopen::DeriveBNTensorDescriptor( + derivedBnDesc, + BNTestData::input.desc, + BNTestData::bn_mode); + scale = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + shift = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveMean = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + // ref + saveMean_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + saveVariance_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runMean_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; + runVariance_ref = tensor{ + BNTestData::tensor_layout, + derivedBnDesc.GetLengths()}; } void InitTensorsWithRandValue() { - auto gen_value = [](auto...) { - return prng::gen_descreet_uniform_sign(1e-2, 100); - }; - scale.generate(gen_value); - shift.generate(gen_value); - - auto gen_var = [](auto...) 
{ - return static_cast(1e-2 * (prng::gen_0_to_B(100) + 1)); - }; - runMean.generate(gen_var); - runVariance.generate(gen_var); - - saveMean_ref = saveMean; - saveVariance_ref = saveVariance; - runMean_ref = runMean; - runVariance_ref = runVariance; + // -2.0 to 2.0 + scale.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + shift.generate(uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + runMean.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + runVariance.generate( + uniform_signed_initializer(2e-3 /*scale*/, 1000 /*range*/)); + + std::transform(saveMean.data.begin(), + saveMean.data.end(), + saveMean_ref.data.begin(), + [](float val) { return static_cast(val); }); + std::transform(saveVariance.data.begin(), + saveVariance.data.end(), + saveVariance_ref.data.begin(), + [](float val) { return static_cast(val); }); + std::transform(runMean.data.begin(), + runMean.data.end(), + runMean_ref.data.begin(), + [](float val) { return static_cast(val); }); + std::transform(runVariance.data.begin(), + runVariance.data.end(), + runVariance_ref.data.begin(), + [](float val) { return static_cast(val); }); } void WriteToGPU() { diff --git a/test/gtest/test_operations.hpp b/test/gtest/test_operations.hpp index cff9c90f55..6fcef9fb01 100644 --- a/test/gtest/test_operations.hpp +++ b/test/gtest/test_operations.hpp @@ -39,19 +39,37 @@ void ComputeCPUBNInference(DLModule& dl_module) } }; ReshapeIfNeeded(dl_module.input.desc); - ReshapeIfNeeded(dl_module.ref_out.desc); + ReshapeIfNeeded(dl_module.out_ref.desc); ReshapeIfNeeded(dl_module.scale.desc); ReshapeIfNeeded(dl_module.shift.desc); ReshapeIfNeeded(dl_module.estMean.desc); ReshapeIfNeeded(dl_module.estVariance.desc); - batchNormSpatialHostInference(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.estMean, - dl_module.estVariance); + if(dl_module.bn_mode == miopenBNSpatial) + { + 
batchNormSpatialHostInference(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); + } + else if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActivHostInference(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.estMean, + dl_module.estVariance); + } + else + { + std::cout << "\nUnknown inference batch miopenBatchNormMode_t\n"; + exit(EXIT_FAILURE); + } } template @@ -68,21 +86,40 @@ void ComputeCPUBNBwd(DLModule& dl_module) }; ReshapeIfNeeded(dl_module.input.desc); ReshapeIfNeeded(dl_module.dy.desc); - ReshapeIfNeeded(dl_module.ref_out.desc); + ReshapeIfNeeded(dl_module.out_ref.desc); ReshapeIfNeeded(dl_module.bnScale.desc); ReshapeIfNeeded(dl_module.dScale_ref.desc); ReshapeIfNeeded(dl_module.dBias_ref.desc); ReshapeIfNeeded(dl_module.savedMean.desc); ReshapeIfNeeded(dl_module.savedInvVar.desc); - batchNormSpatialHostBwdTrain(dl_module.input, - dl_module.dy, - dl_module.ref_out, - dl_module.bnScale, - dl_module.dScale_ref, - dl_module.dBias_ref, - dl_module.savedMean, - dl_module.savedInvVar); + if(dl_module.bn_mode == miopenBNSpatial) + { + batchNormSpatialHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.out_ref, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); + } + else if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActHostBwdTrain(dl_module.input, + dl_module.dy, + dl_module.out_ref, + dl_module.bnScale, + dl_module.dScale_ref, + dl_module.dBias_ref, + dl_module.savedMean, + dl_module.savedInvVar); + } + else + { + std::cout << "\nUnknown BwdTrain batch miopenBatchNormMode_t\n"; + exit(EXIT_FAILURE); + } } template @@ -98,7 +135,7 @@ void ComputeCPUBNFwdTrain(DLModule& dl_module) } }; ReshapeIfNeeded(dl_module.input.desc); - ReshapeIfNeeded(dl_module.ref_out.desc); + 
ReshapeIfNeeded(dl_module.out_ref.desc); ReshapeIfNeeded(dl_module.scale.desc); ReshapeIfNeeded(dl_module.shift.desc); ReshapeIfNeeded(dl_module.saveMean_ref.desc); @@ -106,30 +143,51 @@ void ComputeCPUBNFwdTrain(DLModule& dl_module) ReshapeIfNeeded(dl_module.runMean_ref.desc); ReshapeIfNeeded(dl_module.runVariance_ref.desc); - batchNormSpatialHostFwdTrain(dl_module.input, - dl_module.ref_out, - dl_module.scale, - dl_module.shift, - dl_module.epsilon, - dl_module.averageFactor, - dl_module.saveMean_ref, - dl_module.saveVariance_ref, - dl_module.runMean_ref, - dl_module.runVariance_ref); + if(dl_module.bn_mode == miopenBNSpatial) + { + batchNormSpatialHostFwdTrain(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); + } + else if(dl_module.bn_mode == miopenBNPerActivation) + { + batchNormPerActHostFwdTrain(dl_module.input, + dl_module.out_ref, + dl_module.scale, + dl_module.shift, + dl_module.epsilon, + dl_module.averageFactor, + dl_module.saveMean_ref, + dl_module.saveVariance_ref, + dl_module.runMean_ref, + dl_module.runVariance_ref); + } + else + { + std::cout << "\nUnknown FwdTrain batch miopenBatchNormMode_t\n"; + exit(EXIT_FAILURE); + } } -template +template void CompareTensor(const tensor& output, - const tensor& ref_out, + const tensor& out_ref, const double threshold = std::numeric_limits::epsilon()) { - EXPECT_FALSE(miopen::range_zero(ref_out)) << "CPU data is all zeros"; + EXPECT_FALSE(miopen::range_zero(out_ref)) << "CPU data is all zeros"; EXPECT_FALSE(miopen::range_zero(output)) << "GPU data is all zeros"; EXPECT_FALSE(miopen::find_idx(output, miopen::not_finite) >= 0) << "Non finite number found in the GPU data"; - EXPECT_TRUE(miopen::range_distance(ref_out) == miopen::range_distance(output)); - auto error = miopen::rms_range(ref_out, output); - 
EXPECT_FALSE(miopen::find_idx(ref_out, miopen::not_finite) >= 0) + EXPECT_TRUE(miopen::range_distance(out_ref) == miopen::range_distance(output)); + auto error = miopen::rms_range(out_ref, output); + EXPECT_FALSE(miopen::find_idx(out_ref, miopen::not_finite) >= 0) << "Non finite number found in the CPU data"; EXPECT_TRUE(error < threshold) << "Error beyond tolerance Error:" << error << ", Threshold: " << threshold; diff --git a/test/gtest/unit_FinInterface.cpp b/test/gtest/unit_FinInterface.cpp index 07d8208d01..acd9b8be3f 100644 --- a/test/gtest/unit_FinInterface.cpp +++ b/test/gtest/unit_FinInterface.cpp @@ -320,16 +320,15 @@ const auto& GetSolversInfo() /// \ref fin_interface_solver_info_coverage static const std::unordered_map solver_info = { // clang-format off - {"BnFwdTrainingSpatialSingle", {113, false, false}}, - {"BnFwdTrainingSpatialMultiple", {115, false, false}}, - {"BnFwdTrainingPerActivation", {116, false, false}}, - {"BnBwdTrainingSpatialSingle", {117, false, false}}, - {"BnBwdTrainingSpatialMultiple", {118, false, false}}, - {"BnBwdTrainingPerActivation", {119, false, false}}, - {"BnFwdInference", {120, true, false}}, - {"BnCKFwdInference", {142, true, true}}, - {"BnCKBwdBackward", {143, true, true}}, - {"BnCKFwdTraining", {144, true, true}}, + // solver-name id, isDynamic, isTunable + {"BnFwdTrainingSpatial", {113, false, true}}, + {"BnFwdTrainingPerActivation", {116, false, false}}, + {"BnBwdTrainingSpatial", {117, false, true}}, + {"BnBwdTrainingPerActivation", {119, false, false}}, + {"BnFwdInference", {120, true, false}}, + {"BnCKFwdInference", {142, true, true}}, + {"BnCKBwdBackward", {143, true, true}}, + {"BnCKFwdTraining", {144, true, true}}, // clang-format on }; diff --git a/test/random.hpp b/test/random.hpp index 9b4815bc1d..62443abb10 100644 --- a/test/random.hpp +++ b/test/random.hpp @@ -40,5 +40,23 @@ inline T gen_descreet_unsigned(double scale, int32_t range) { return static_cast(scale * static_cast(gen_0_to_B(range))); } + } // 
namespace prng + +// lambda factory +template +auto uniform_signed_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { + // uniform sign give balance of both negative and positive values + return prng::gen_descreet_uniform_sign(scale_arg, range_arg); + }; +} + +template +auto uniform_unsigned_initializer(ScaleT scale_arg, RangeT range_arg) +{ + return [=](auto&&...) -> T { return prng::gen_descreet_unsigned(scale_arg, range_arg); }; +} + #endif // GUARD_MIOPEN_TEST_RANDOM_HPP