
Commit f6c2ff9

Grouped convolution forward with clamp (#2334)
* Grouped convolution forward with clamp
* Optimize clamp
* unary fixes
* test gk bias
* Revert "test gk bias"
  This reverts commit 8e42e29.
* Revert "Revert "test gk bias""
  This reverts commit e73c055.
* workaround comment
1 parent d996bc7 commit f6c2ff9
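
At its core, the commit adds a Clamp elementwise operator to unary_element_wise_operation.hpp and teaches the grouped convolution forward device ops to fuse non-PassThrough elementwise ops into the GEMM epilogue for bf16 outputs. As orientation before the diffs, here is a minimal standalone sketch of what the new functor computes; it mirrors the committed code but substitutes plain float and std::numeric_limits for CK's half_t/bhalf_t and NumericLimits machinery, so it is an illustration, not the CK API:

// Standalone sketch of the Clamp functor added in this commit (float only;
// the real header also specializes double, half_t, bhalf_t, int and int8_t).
#include <cassert>
#include <limits>

struct Clamp
{
    Clamp(float floor = 0.f, float ceil = std::numeric_limits<float>::max())
        : floor_(floor), ceil_(ceil)
    {
    }

    // Same branch structure as the committed code: y = min(max(x, floor_), ceil_).
    void operator()(float& y, const float& x) const
    {
        y = x > floor_ ? (x < ceil_ ? x : ceil_) : floor_;
    }

    const float floor_;
    const float ceil_;
};

int main()
{
    Clamp clamp{0.f, 6.f}; // ReLU6-style bounds
    float y = 0.f;
    clamp(y, 7.5f);
    assert(y == 6.f); // above the ceiling
    clamp(y, -1.f);
    assert(y == 0.f); // below the floor
    clamp(y, 3.25f);
    assert(y == 3.25f); // in range, passes through
    return 0;
}

With the default arguments the functor degenerates to a plain ReLU with an open ceiling, which is why floor_ defaults to 0.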

File tree

41 files changed, +2103 -106 lines


include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp

Lines changed: 8 additions & 3 deletions
@@ -311,8 +311,9 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
 
     static_assert(NumGroupsToMerge >= 1);
 
-    static constexpr bool isMultiA = is_detected<is_tuple, ADataType>::value;
-    static constexpr bool isMultiB = is_detected<is_tuple, BDataType>::value;
+    static constexpr bool isMultiA  = is_detected<is_tuple, ADataType>::value;
+    static constexpr bool isMultiB  = is_detected<is_tuple, BDataType>::value;
+    static constexpr bool isMultiAB = isMultiA || isMultiB;
 
     // NGCHW is not supported for multiAB
     static_assert(!(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
@@ -323,6 +324,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     static constexpr index_t NumBTensor = GetNumABTensors<isMultiB, BDataType>();
     static constexpr index_t NumDTensor = DsDataType::Size();
 
+    static constexpr bool DoElementwiseBeforeCShuffle =
+        NumDTensor == 0 && !isMultiAB && is_same_v<EDataType, bhalf_t> &&
+        !is_same_v<CDEElementwiseOperation, tensor_operation::element_wise::PassThrough>;
+
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
     static constexpr auto I2 = Number<2>{};
@@ -465,7 +470,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \
         CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \
-        BComputeDataType
+        BComputeDataType, DoElementwiseBeforeCShuffle
     // Use appropriate gridwise gemm
     using GridwiseGemm = std::conditional_t<
         isMultiA || isMultiB,
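
The new DoElementwiseBeforeCShuffle flag is pure compile-time bookkeeping: it turns on only when the output type is bhalf_t and the CDE elementwise op actually does work (it is not PassThrough), in which case the op can be applied to the float accumulator before the CShuffle stage narrows to bf16, which is exactly what the new (bhalf_t&, const float&) overloads in unary_element_wise_operation.hpp below support. A stand-in sketch of the condition, with dummy types replacing the real template parameters; only the boolean expression itself is taken from the commit:

// Hedged sketch: PassThrough/Clamp/bhalf_t here are stand-ins, not CK types.
#include <type_traits>

struct PassThrough { };         // stands in for element_wise::PassThrough
struct Clamp { };               // stands in for the new element_wise::Clamp
using bhalf_t = unsigned short; // stands in for CK's bf16 storage type

template <typename EDataType, typename CDEElementwiseOperation, int NumDTensor, bool isMultiAB>
constexpr bool DoElementwiseBeforeCShuffle =
    NumDTensor == 0 && !isMultiAB && std::is_same_v<EDataType, bhalf_t> &&
    !std::is_same_v<CDEElementwiseOperation, PassThrough>;

// bf16 output with a real op: apply it in float, before the final narrowing.
static_assert(DoElementwiseBeforeCShuffle<bhalf_t, Clamp, 0, false>);
// PassThrough has nothing to apply, so the flag stays off.
static_assert(!DoElementwiseBeforeCShuffle<bhalf_t, PassThrough, 0, false>);

int main() { return 0; }

The NumDTensor == 0 and !isMultiAB guards keep the early-elementwise path away from fusions that still need D tensors or multi-A/B inputs applied after the GEMM.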

include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp

Lines changed: 19 additions & 3 deletions
@@ -279,6 +279,10 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
     static constexpr bool isMultiD = DsDataType::Size() > 0;
     static constexpr bool isMultiABD = isMultiA || isMultiB || isMultiD;
 
+    static constexpr bool DoElementwiseBeforeCShuffle =
+        !isMultiABD && is_same_v<EDataType, bhalf_t> &&
+        !is_same_v<CDEElementwiseOperation, tensor_operation::element_wise::PassThrough>;
+
     static constexpr index_t NumATensor = GetNumABTensors<isMultiA, ADataType>();
     static constexpr index_t NumBTensor = GetNumABTensors<isMultiB, BDataType>();
     static constexpr index_t NumDTensor = DsDataType::Size();
@@ -412,7 +416,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
         BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \
         CDEBlockTransferScalarPerVector_NPerBlock, BlkGemmPipeSched, BlkGemmPipelineVer, \
-        AComputeDataType, BComputeDataType
+        AComputeDataType, BComputeDataType, false, false, DoElementwiseBeforeCShuffle
 
     // Use appropriate gridwise gemm
     using GridwiseGemm = GridwiseGemm_xdl_cshuffle_v3<GridwiseGemmV3TemplateParams>;
@@ -780,8 +784,20 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
                     sizeof(EDataType);
             }
 
-            typename GridwiseGemm::Argument gemm_arg{
-                p_a_grid, p_b_grid, p_e_grid, GemmM, GemmN, GemmK, I0, I0, I0, I1};
+            typename GridwiseGemm::Argument gemm_arg{p_a_grid,
+                                                     p_b_grid,
+                                                     p_e_grid,
+                                                     GemmM,
+                                                     GemmN,
+                                                     GemmK,
+                                                     I0,
+                                                     I0,
+                                                     I0,
+                                                     I1,
+                                                     false,
+                                                     arg.a_element_op_,
+                                                     arg.b_element_op_,
+                                                     arg.cde_element_op_};
 
             const auto Run = [&](const auto& kernel) {
                 if(stream_config.flush_cache)

include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp

Lines changed: 4 additions & 1 deletion
@@ -192,6 +192,9 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
 
     static constexpr index_t NumDTensor = DsDataType::Size();
     static constexpr index_t MaxGemmsNum = 32;
+    static constexpr bool DoElementwiseBeforeCShuffle =
+        NumDTensor == 0 && is_same_v<EDataType, bhalf_t> &&
+        !is_same_v<CDEElementwiseOperation, tensor_operation::element_wise::PassThrough>;
 
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
@@ -361,7 +364,7 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
         BBlockLdsExtraN, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle, \
         CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, \
         CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1, \
-        AComputeDataType
+        AComputeDataType, DoElementwiseBeforeCShuffle
     // Use appropriate gridwise gemm
     using GridwiseGemm = GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>;

include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp

Lines changed: 179 additions & 0 deletions
@@ -730,6 +730,15 @@ struct UnaryAbs
     {
         y = ck::type_convert<f8_t>(ck::math::abs(ck::type_convert<float>(x)));
     };
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        y = ck::type_convert<bhalf_t>(ck::math::abs(x));
+    };
 };
 
 struct UnarySqrt
@@ -744,6 +753,79 @@ struct UnarySqrt
     };
 };
 
+struct Clamp
+{
+    Clamp(float floor = 0.f, float ceil = NumericLimits<float>::Max())
+        : floor_(floor), ceil_(ceil){};
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<float, float>(float& y, const float& x) const
+    {
+        const float& a = x;
+        y              = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<double, double>(double& y, const double& x) const
+    {
+        const double& a = x;
+        y               = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, half_t>(half_t& y, const half_t& x) const
+    {
+        const float a = type_convert<float>(x);
+        const float b = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+        y             = type_convert<half_t>(b);
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<half_t, float>(half_t& y, const float& x) const
+    {
+        const float& a = x;
+        const float b  = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+        y              = type_convert<half_t>(b);
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        const float& a = x;
+        const float b  = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+        y              = type_convert<bhalf_t>(b);
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<bhalf_t, bhalf_t>(bhalf_t& y,
+                                                                    const bhalf_t& x) const
+    {
+        const float a = type_convert<float>(x);
+        const float b = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+        y             = type_convert<bhalf_t>(b);
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<int, int>(int& y, const int& x) const
+    {
+        const int& a = x;
+        y            = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+    };
+
+    template <>
+    __host__ __device__ constexpr void operator()<int8_t, int8_t>(int8_t& y, const int8_t& x) const
+    {
+        const int8_t& a = x;
+        y               = a > floor_ ? (a < ceil_ ? a : ceil_) : floor_;
+    };
+
+    const float floor_;
+    const float ceil_;
+};
+
 struct Relu
 {
     template <typename T>
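
Clamp and the other functors in this file share a dispatch idiom: the generic operator()(Y&, const X&) is declared but never defined, so only the explicitly specialized (Y, X) pairs are actually callable and an unsupported combination fails at link time instead of silently converting. A portable sketch of the idiom; the in-class explicit specializations used in the CK header rely on class-scope specialization support (CWG 727) in clang/hipcc, while the namespace-scope form below compiles anywhere:

// Sketch of the declared-but-undefined dispatch idiom (Abs is illustrative).
#include <cmath>

struct Abs
{
    // Generic form: declared only, intentionally never defined.
    template <typename Y, typename X>
    void operator()(Y& y, const X& x) const;
};

// Explicit specialization at namespace scope, portable across compilers.
template <>
void Abs::operator()<float, float>(float& y, const float& x) const
{
    y = std::fabs(x);
}

int main()
{
    Abs abs_op;
    float y = 0.f;
    abs_op(y, -2.5f); // fine: (float, float) is specialized
    // double d; abs_op(d, -2.5); // would compile here but fail to link
    return 0;
}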
@@ -756,13 +838,23 @@ struct Relu
         y = x > 0 ? x : 0;
     }
 
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
     template <>
     __host__ __device__ void operator()(bhalf_t& y, const bhalf_t& x) const
     {
         float x_f32 = type_convert<float>(x);
         float y_f32 = x_f32 > 0 ? x_f32 : 0;
         y           = type_convert<bhalf_t>(y_f32);
     }
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        float y_f32 = x > 0 ? x : 0;
+        y           = type_convert<bhalf_t>(y_f32);
+    };
 };
 
 // Fast GeLU
@@ -915,6 +1007,16 @@ struct Sigmoid
         constexpr T one = type_convert<T>(1);
         y               = one / (one + math::exp(-x));
     };
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        constexpr float one = 1.f;
+        y                   = type_convert<bhalf_t>(one / (one + math::exp(-x)));
+    };
 };
 
 struct Silu
@@ -942,6 +1044,15 @@ struct TanH
 
         y = math::tanh(x);
     };
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        y = type_convert<bhalf_t>(math::tanh(x));
+    };
 };
 
 struct ACos
@@ -1201,6 +1312,13 @@ struct Swish
         y = type_convert<Y>(x / (1.f + math::exp(bx)));
     };
 
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        float bx = -beta_ * x;
+        y        = type_convert<bhalf_t>(x / (1.f + math::exp(bx)));
+    };
+
     const float beta_;
 };
 
@@ -1219,6 +1337,16 @@ struct SoftRelu
         constexpr T one = type_convert<T>(1);
         y = math::log(one + math::exp(x * casted_alpha)) / casted_alpha;
     }
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        constexpr float one = 1.f;
+        y = type_convert<bhalf_t>(math::log(one + math::exp(x * alpha_)) / alpha_);
+    };
     const float alpha_;
 };
 
@@ -1240,6 +1368,17 @@ struct Power
         T shifted_scaled_x = casted_alpha + casted_beta * x;
         y = math::pow(shifted_scaled_x, casted_gamma);
     }
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        const float shifted_scaled_x = alpha_ + beta_ * x;
+        y = type_convert<bhalf_t>(math::pow(shifted_scaled_x, gamma_));
+    };
+
     const float alpha_;
     const float beta_;
     const float gamma_;
@@ -1260,6 +1399,16 @@ struct ClippedRelu
         T casted_beta = type_convert<T>(beta_);
         y = math::min(casted_beta, math::max(casted_alpha, x));
     }
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        y = type_convert<bhalf_t>(math::min(beta_, math::max(alpha_, x)));
+    };
+
     const float alpha_;
     const float beta_;
 };
@@ -1278,6 +1427,16 @@ struct LeakyRelu
         T casted_alpha = type_convert<T>(alpha_);
         y = x >= 0 ? x : x * casted_alpha;
     }
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        y = type_convert<bhalf_t>(x >= 0 ? x : x * alpha_);
+    };
+
     const float alpha_;
 };
 
@@ -1295,6 +1454,16 @@ struct Elu
         T casted_alpha = type_convert<T>(alpha_);
        y = x > 0 ? x : casted_alpha * math::expm1(x);
     }
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        y = type_convert<bhalf_t>(x > 0 ? x : alpha_ * math::expm1(x));
+    };
+
     const float alpha_;
 };
 
@@ -1313,6 +1482,16 @@ struct Logistic
         constexpr T one = type_convert<T>(1);
         y = casted_alpha / (one + ck::math::exp(-x) * casted_alpha);
     }
+
+    template <typename Y, typename X>
+    __host__ __device__ constexpr void operator()(Y& y, const X& x) const;
+
+    template <>
+    __host__ __device__ void operator()<bhalf_t, float>(bhalf_t& y, const float& x) const
+    {
+        constexpr float one = 1.f;
+        y = type_convert<bhalf_t>(alpha_ / (one + ck::math::exp(-x) * alpha_));
+    };
     const float alpha_;
 };
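
A pattern worth noting across all of these hunks: every functor gains a (bhalf_t& y, const float& x) specialization, and the evident motivation is that the GEMM epilogue keeps partial results in float. With these overloads the activation runs at full precision and the narrowing to bf16 happens exactly once, whereas the pre-existing (bhalf_t&, const bhalf_t&) overloads (see Relu above) must operate on values that were already truncated. A self-contained illustration of why the order matters; to_bf16/to_float are assumed truncating stand-ins for ck::type_convert, not the CK implementation, and the subtract-one op is just an arbitrary elementwise function:

// Why applying the elementwise op before the bf16 narrowing matters.
#include <cstdint>
#include <cstdio>
#include <cstring>

using bhalf_t = std::uint16_t; // storage-only stand-in for CK's bhalf_t

// Truncating float -> bf16 (keep the upper 16 bits of the fp32 pattern).
bhalf_t to_bf16(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return static_cast<bhalf_t>(bits >> 16);
}

// bf16 -> float (pad the low 16 bits with zeros).
float to_float(bhalf_t x)
{
    std::uint32_t bits = static_cast<std::uint32_t>(x) << 16;
    float y;
    std::memcpy(&y, &bits, sizeof(y));
    return y;
}

float op(float v) { return v - 1.f; } // arbitrary elementwise op

int main()
{
    const float acc = 1.00390625f; // 1 + 2^-8: exact in fp32, not in bf16

    // New path: op on the float accumulator, one narrowing at the end.
    float before = to_float(to_bf16(op(acc)));                    // 0.00390625
    // Old path: narrow to bf16 first, then apply the op.
    float after = to_float(to_bf16(op(to_float(to_bf16(acc))))); // 0

    std::printf("op before convert: %g, op after convert: %g\n", before, after);
    return 0;
}

With round-to-nearest conversions the gap shrinks but does not vanish; applying the op on the float side is the safe order either way.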
