Release fixes/fix nhwc support #3657

Open
wants to merge 13 commits into base: release/rocm-rel-6.4
Changes from 11 commits
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -8,8 +8,11 @@ Full documentation for MIOpen is available [here](https://rocm.docs.amd.com/proj
* [Conv] Enabled tuning through the `miopenSetConvolutionFindMode` API
* [RNN] Added the new algorithm type `miopenRNNroundedDynamic` for LSTM
* [TunaNet] Enabled NHWC for MI300
* [BatchNorm] Enabled broad support for NHWC
* [BatchNorm] Enabled tuning through MIOPEN_FIND_ENFORCE
### Optimized
* Updated KernelTuningNet for CK solvers
* NHWC Batchnorm

### Resolved issues

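For context on the tuning-related entries above: MIOPEN_FIND_ENFORCE is an environment variable read by MIOpen's find/tuning path, and miopenSetConvolutionFindMode is a descriptor-level API. A minimal sketch of how a client might request tuning follows; the "SEARCH" value and the miopenConvolutionFindModeNormal enum are assumptions about the public API rather than part of this PR.

#include <cstdlib>
#include <miopen/miopen.h>

// Sketch: ask MIOpen to tune kernels on the next Find call (assumed env value
// "SEARCH") and pick a find mode on the convolution descriptor.
void request_tuning(miopenConvolutionDescriptor_t convDesc)
{
    setenv("MIOPEN_FIND_ENFORCE", "SEARCH", /*overwrite=*/1);
    miopenSetConvolutionFindMode(convDesc, miopenConvolutionFindModeNormal);
}
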
6 changes: 5 additions & 1 deletion Jenkinsfile
@@ -245,7 +245,11 @@ def buildHipClangJob(Map conf=[:]){
if (conf.get("enforce_xnack_on", false)) {
dockerOpts = dockerOpts + " --env HSA_XNACK=1"
}

// Add the host's video/render group IDs so the container user can access the GPU device nodes.
// trim() strips the trailing newline from the shell output before it is interpolated into the flags.
def video_id = sh(returnStdout: true, script: 'getent group video | cut -d: -f3').trim()
def render_id = sh(returnStdout: true, script: 'getent group render | cut -d: -f3').trim()
dockerOpts = dockerOpts + " --group-add=${video_id} --group-add=${render_id} "
echo "Docker flags: ${dockerOpts}"

def variant = env.STAGE_NAME

def needs_gpu = conf.get("needs_gpu", true)
96 changes: 44 additions & 52 deletions driver/bn_driver.hpp
@@ -195,10 +195,8 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::GetandSetData()
SetBNParametersFromCmdLineArgs();

in.AllocOnHost(tensor<TInput>{bn_layout, in_len});
for(size_t i = 0; i < in.GetVector().size(); i++)
{
in.GetVector()[i] = prng::gen_canonical<TInput>();
}
// 0.0 to 2.0
in.GetTensor().generate(uniform_unsigned_initializer<TInput>(2e-3 /*scale*/, 1000 /*range*/));

auto derivedBnDesc = miopen::TensorDescriptor{};
miopen::DeriveBNTensorDescriptor(derivedBnDesc, in.GetTensor().desc, bn_mode);
@@ -208,20 +206,25 @@
out.AllocOnHost(tensor<TInput>{bn_layout, in_len});
scale.AllocOnHost(tensor<TScaleBias>{bn_layout, derivedBnDesc.GetLengths()});
bias.AllocOnHost(tensor<TScaleBias>{bn_layout, derivedBnDesc.GetLengths()});

for(int i = 0; i < scale.GetVector().size(); i++)
{
scale.GetVector()[i] = prng::gen_canonical<TInput>();
bias.GetVector()[i] = prng::gen_canonical<TInput>();
}
// -2.0 to 2.0
scale.GetTensor().generate(
uniform_signed_initializer<TScaleBias>(2e-3 /*scale*/, 1000 /*range*/));
bias.GetTensor().generate(
uniform_signed_initializer<TScaleBias>(2e-3 /*scale*/, 1000 /*range*/));
}
if(isFwdInfer)
{
estMean.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});
estVariance.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});

auto gen_value_emean = [](auto...) { return prng::gen_descreet_unsigned<TAcc>(1e-2, 100); };
estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(), true, gen_value_emean);
// -2.0 to 2.0
estMean.InitHostData(estMean.GetTensor().desc.GetElementSize(),
true,
uniform_signed_initializer<TAcc>(2e-3 /*scale*/, 1000 /*range*/));
// estVariance has to be a positive number, otherwise 1/sqrt(negative)
// would give an imaginary number
estVariance.GetTensor().generate(
uniform_unsigned_initializer<TAcc>(2e-3 /*scale*/, 1000 /*range*/));
}
else if(isFwdTrain)
{
@@ -230,45 +233,45 @@
runMean.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});
runVariance.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});

for(int i = 0; i < runVariance.GetVector().size(); i++)
{
runMean.GetVector()[i] = prng::gen_canonical<TAcc>();
runVariance.GetVector()[i] = prng::gen_canonical<TAcc>();
}
// -2.0 to 2.0
runMean.GetTensor().generate(
uniform_signed_initializer<TAcc>(2e-3 /*scale*/, 1000 /*range*/));
runVariance.GetTensor().generate(
uniform_signed_initializer<TAcc>(2e-3 /*scale*/, 1000 /*range*/));
}
else if(isBwd)
{
out_bwd.AllocOnHost(tensor<TOut>{bn_layout, in_len});

bnScale.AllocOnHost(tensor<TScaleBias>{bn_layout, derivedBnDesc.GetLengths()});
dy.AllocOnHost(tensor<TOut>{bn_layout, in_len});

auto gen_var_bwd = [](auto...) {
return static_cast<TOut>(1e-2 * (prng::gen_0_to_B(100) + 1));
};

dy.InitHostData(dy.GetTensor().desc.GetElementSize(), true, gen_var_bwd);
// -2.0 to 2.0
dy.InitHostData(dy.GetTensor().desc.GetElementSize(),
true,
uniform_signed_initializer<TOut>(2e-3 /*scale*/, 1000 /*range*/));

dScale.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});
dBias.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});
savedMean.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});
savedInvVar.AllocOnHost(tensor<TAcc>{bn_layout, derivedBnDesc.GetLengths()});

auto gen_value = [](auto...) { return prng::gen_descreet_unsigned<TScaleBias>(1e-2, 100); };
bnScale.InitHostData(bnScale.GetTensor().desc.GetElementSize(), true, gen_value);

auto gen_in_var = [](auto...) {
return static_cast<TAcc>(1e-2 * (prng::gen_0_to_B(100) + 1));
};
savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(), true, gen_in_var);
savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(), true, gen_in_var);
bnScale.InitHostData(
bnScale.GetTensor().desc.GetElementSize(),
true,
uniform_signed_initializer<TScaleBias>(2e-3 /*scale*/, 1000 /*range*/));
// -2.0 to 2.0
savedMean.InitHostData(savedMean.GetTensor().desc.GetElementSize(),
true,
uniform_signed_initializer<TAcc>(2e-3 /*scale*/, 1000 /*range*/));
savedInvVar.InitHostData(savedInvVar.GetTensor().desc.GetElementSize(),
true,
uniform_signed_initializer<TAcc>(2e-3 /*scale*/, 1000 /*range*/));
}
else
{
std::cout << "\nUnknown batch norm state!\n";
exit(EXIT_FAILURE);
}

return miopenStatusSuccess;
}

@@ -297,7 +300,7 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::AddCmdLineArgs()

inflags.AddInputFlag("alpha", 'A', "1.0", "Alpha (Default=1.0)", "float");
inflags.AddInputFlag("beta", 'B', "0.", "Beta (Default=0.)", "float");
inflags.AddInputFlag("iter", 'i', "1", "Number of Iterations (Default=1)", "int");
inflags.AddInputFlag("iter", 'i', "10", "Number of Iterations (Default=10)", "int");
inflags.AddInputFlag("verify", 'V', "1", "Verify Each Layer (Default=1)", "int");
inflags.AddInputFlag("time", 't', "0", "Time Each Layer (Default=0)", "int");
inflags.AddInputFlag("printconv", 'P', "1", "Print Convolution Dimensions (Default=1)", "int");
@@ -1364,11 +1367,6 @@ int BatchNormDriver<TInput, Tref, TAcc, TScaleBias, TOut>::RunBackwardCPU()
if(!back)
return miopenStatusSuccess;

// T alphaDiff = 1, betaDiff = 0;
// T alphaParam = 1, betaParam = 0;
double alpha = static_cast<double>(1), beta = static_cast<double>(0),
gamma = static_cast<double>(1);

// float alphaDataDiff = static_cast<float>(1), betaDataDiff = static_cast<float>(0);
// float alphaParamDiff = static_cast<float>(1), betaParamDiff = static_cast<float>(0);
int size{0};
@@ -1394,20 +1392,14 @@
if(bn_mode == miopenBNPerActivation)
{
// 1xCxHxW
batchNormActivSpatialHostBwdTrain(activ_mode,
gamma,
beta,
alpha,
in.GetTensor(),
dy.GetTensor(),
out.GetTensor(),
out_ref,
bnScale.GetTensor(),
dBias.GetTensor(),
dScale_ref,
dBias_ref,
savedMean.GetTensor(),
savedInvVar.GetTensor());
batchNormPerActHostBwdTrain(in.GetTensor(),
dy.GetTensor(),
out_ref,
bnScale.GetTensor(),
dScale_ref,
dBias_ref,
savedMean.GetTensor(),
savedInvVar.GetTensor());
}
else if(bn_mode == miopenBNSpatial)
{ // 1xCx1x1
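
The driver hunks above replace hand-written prng loops with tensor generate()/InitHostData() calls that take uniform_signed_initializer / uniform_unsigned_initializer. A rough, self-contained sketch of what the (2e-3 scale, 1000 range) arguments imply, matching the "0.0 to 2.0" and "-2.0 to 2.0" comments, is given below; the real helpers live in MIOpen's driver/test headers and may differ in detail.

#include <random>

// Illustrative stand-ins only: draw an integer in [0, range] (unsigned) or
// [-range, range] (signed) and multiply by scale, so scale = 2e-3 and
// range = 1000 produce values in [0.0, 2.0] or [-2.0, 2.0].
template <typename T>
auto uniform_unsigned_initializer_sketch(double scale, int range)
{
    return [=](auto&&...) {
        static std::mt19937 gen{0};
        std::uniform_int_distribution<int> dist(0, range);
        return static_cast<T>(scale * dist(gen));
    };
}

template <typename T>
auto uniform_signed_initializer_sketch(double scale, int range)
{
    return [=](auto&&...) {
        static std::mt19937 gen{0};
        std::uniform_int_distribution<int> dist(-range, range);
        return static_cast<T>(scale * dist(gen));
    };
}
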
2 changes: 1 addition & 1 deletion fin
6 changes: 2 additions & 4 deletions src/CMakeLists.txt
@@ -211,15 +211,13 @@ set( MIOpen_Source
solver/batchnorm/backward_ck.cpp
solver/batchnorm/backward_per_activation.cpp
solver/batchnorm/backward_per_activation_fused.cpp
solver/batchnorm/backward_spatial_multiple.cpp
solver/batchnorm/backward_spatial_single.cpp
solver/batchnorm/backward_spatial.cpp
solver/batchnorm/forward_inference.cpp
solver/batchnorm/forward_inference_ck.cpp
solver/batchnorm/forward_inference_fused.cpp
solver/batchnorm/forward_per_activation.cpp
solver/batchnorm/forward_per_activation_fused.cpp
solver/batchnorm/forward_spatial_multiple.cpp
solver/batchnorm/forward_spatial_single.cpp
solver/batchnorm/forward_spatial.cpp
solver/batchnorm/forward_training_ck.cpp
solver/cat/forward_cat.cpp
solver/conv/conv_asm_1x1u.cpp
36 changes: 12 additions & 24 deletions src/batch_norm.cpp
@@ -73,28 +73,23 @@ TensorDescriptor BuildReshaped4DTensorDescriptor(const miopen::TensorDescriptor&
if(layout == miopenTensorNCDHW)
{
layout = miopenTensorNCHW;

// NxCxDxHxW -> NxCx(D*H)xW
dims[2] *= dims[3];
dims[3] = dims[4];
dims.pop_back();
}
else if(layout == miopenTensorNDHWC)
{
layout = miopenTensorNHWC;

// NxDxHxWxC -> Nx(D*H)xWxC
dims[1] *= dims[2];
dims[2] = dims[3];
dims[3] = dims[4];
dims.pop_back();
}
else
{
std::cout << "Cannot handle layout : " << layout << "\n";
exit(EXIT_FAILURE); // NOLINT (concurrency-mt-unsafe)
}

// Both NCDHW and NDHWC layouts store the lens in NCDHW form,
// hence: NxCxDxHxW -> NxCx(D*H)xW
dims[2] *= dims[3];
dims[3] = dims[4];
dims.pop_back();

return {dataType, layout, dims};
}

@@ -109,14 +104,11 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime)
case 0:
if(handle.IsProfilingEnabled())
{
*ctime = 0.;
handle.ResetKernelTime();
ktime = handle.GetKernelTime();
*ctime = ktime;

#if(MIO_BN_CPP_PROF == 1)
printf("ktime0: %lf\n", ktime);
printf("ctime: %f\n", *ctime);
printf("kernel0: %7.3f ms total: %7.3f ms\n", ktime, *ctime);
#endif
}
#if(MIOPEN_BN_SYNCH == 1)
@@ -133,8 +125,7 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime)
*ctime += ktime;

#if(MIO_BN_CPP_PROF == 1)
printf("ktime1: %lf\n", ktime);
printf("ctime: %f\n", *ctime);
printf("kernel1: %7.3f ms total: %7.3f ms\n", ktime, *ctime);
#endif
}
#if(MIOPEN_BN_SYNCH == 1)
@@ -148,15 +139,12 @@ void profileSequence(const Handle& handle, unsigned char select, float* ctime)
case 2:
if(handle.IsProfilingEnabled())
{

#if(MIO_BN_CPP_PROF == 1)
ktime = handle.GetKernelTime();
*ctime += ktime;
handle.ResetKernelTime();
handle.AccumKernelTime(*ctime);
printf("ktime2: %lf\n", ktime);
printf("ctime: %f\n", *ctime + ktime);
#else
handle.GetKernelTime();
handle.AccumKernelTime(*ctime);
#if(MIO_BN_CPP_PROF == 1)
printf("kernel2: %7.3f ms total: %7.3f ms\n", ktime, *ctime);
#endif
}
break;
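
BuildReshaped4DTensorDescriptor above collapses a 5-D batch-norm input to 4-D by folding the depth dimension into height, relying on the fact that both NCDHW and NDHWC descriptors store their lengths in NCDHW order. A standalone sketch of just the length collapse, with an illustrative helper name, follows.

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch: collapse 5-D NCDHW lengths {N, C, D, H, W} into the 4-D NCHW form
// {N, C, D*H, W} used by the reshaped batch-norm descriptor.
std::vector<std::size_t> collapse_depth_into_height(std::vector<std::size_t> dims)
{
    assert(dims.size() == 5);
    dims[2] *= dims[3]; // H' = D * H
    dims[3] = dims[4];  // W moves up one slot
    dims.pop_back();    // drop the now-unused fifth entry
    return dims;        // e.g. {4, 8, 2, 16, 16} -> {4, 8, 32, 16}
}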