[GPU] sink reshape for reorder+reshape+permute pattern opt (openvinotoolkit#28183)

### Details:
- When looking at yolov6s, there are quite a lot of
conv+reorder+reshape+permute patterns, as shown below:

![image](https://github.com/user-attachments/assets/6c4706c8-cada-4761-977f-9e36161add0e)
convolution ([N,W,C,H], byxf) -> reorder ([N,W,C,H], bfyx) -> reshape
([N,W,CxH], bfyx) -> permute with order (0,2,1) to ([N,CxH,W], bfyx), which
is an equivalent/alias of ([N,W,C,H], byxf), so this execution chain can be
optimized.
- In this PR, we sink the reshape to after the permute where applicable, so
that existing optimizations such as reorder and conv+permute fusion can be
leveraged to achieve the overall optimization (see the sketch below).
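
To make the rewrite concrete, here is a minimal sketch of the equivalence being exploited, built with standard OpenVINO opset1 ops. The shapes follow the unit test in this PR, and the `before_subgraph`/`after_subgraph` names are illustrative, not part of the commit:

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/opsets/opset1.hpp"

// Before: Reshape [2,4,10,10] -> [2,4,100], then Transpose(0,2,1) -> [2,100,4]
std::shared_ptr<ov::Model> before_subgraph() {
    auto in = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{2, 4, 10, 10});
    auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {3}, std::vector<int>{2, 4, 100});
    auto reshape = std::make_shared<ov::opset1::Reshape>(in, reshape_const, true);
    auto order = ov::opset1::Constant::create(ov::element::i32, {3}, std::vector<int>{0, 2, 1});
    auto transpose = std::make_shared<ov::opset1::Transpose>(reshape, order);
    return std::make_shared<ov::Model>(ov::NodeVector{transpose}, ov::ParameterVector{in});
}

// After sinking: Transpose(0,2,3,1) -> [2,10,10,4], then Reshape -> [2,100,4].
// The transpose now sits directly on the 4D conv output, where it can be fused.
std::shared_ptr<ov::Model> after_subgraph() {
    auto in = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{2, 4, 10, 10});
    auto order = ov::opset1::Constant::create(ov::element::i32, {4}, std::vector<int>{0, 2, 3, 1});
    auto transpose = std::make_shared<ov::opset1::Transpose>(in, order);
    auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {3}, std::vector<int>{2, 100, 4});
    auto reshape = std::make_shared<ov::opset1::Reshape>(transpose, reshape_const, true);
    return std::make_shared<ov::Model>(ov::NodeVector{reshape}, ov::ParameterVector{in});
}
```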

### Tickets:
- CVS-159840

---------

Signed-off-by: fishbell <[email protected]>
Co-authored-by: River Li <[email protected]>
Co-authored-by: Chen Peter <[email protected]>
3 people authored Feb 11, 2025
1 parent f47bfc2 commit a5f7ec3
Showing 4 changed files with 312 additions and 0 deletions.
165 changes: 165 additions & 0 deletions src/plugins/intel_gpu/src/plugin/transformations/sink_reshape.cpp
@@ -0,0 +1,165 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "sink_reshape.hpp"

#include "intel_gpu/op/convolution.hpp"
#include "openvino/core/rt_info.hpp"
#include "openvino/opsets/opset1.hpp"
#include "openvino/pass/pattern/op/or.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "transformations/utils/utils.hpp"

namespace ov {
namespace intel_gpu {

SinkReshape::SinkReshape() {
    using namespace ov::pass::pattern;
    using ov::pass::pattern::op::Or;
    using namespace ov::op;

    auto reshape_predicate = [](const ov::Output<ov::Node>& output) -> bool {
        auto supported_conv_act_post_ops_for_fuse = [](const std::shared_ptr<const Node>& node) -> bool {
            return ov::is_type<v0::Relu>(node) || ov::is_type<v0::Elu>(node) || ov::is_type<v0::Sigmoid>(node) ||
                   ov::is_type<v5::HSigmoid>(node) || ov::is_type<v0::Clamp>(node) || ov::is_type<v4::Swish>(node) ||
                   ov::is_type<v4::HSwish>(node) || ov::is_type<v4::Mish>(node) || ov::is_type<v5::Round>(node);
        };
        auto supported_conv_eltwise_post_ops_for_fuse = [](const std::shared_ptr<const Node>& node) -> bool {
            if (ov::is_type<v1::Add>(node) || ov::is_type<v1::Subtract>(node) || ov::is_type<v1::Multiply>(node) ||
                ov::is_type<v1::Divide>(node))
                return std::dynamic_pointer_cast<v0::Constant>(node->get_input_node_shared_ptr(1)) != nullptr;
            return ov::is_type<v0::Exp>(node);
        };
        std::function<bool(const std::shared_ptr<ov::Node>&)> is_suitable_parent;
        is_suitable_parent = [&](const std::shared_ptr<ov::Node>& node) -> bool {
            if (node->get_users().size() != 1 || node->is_dynamic())
                return false;
            if (ov::as_type_ptr<op::Convolution>(node))
                return true;
            for (size_t idx = 0; idx < node->get_input_size(); idx++) {
                auto input = node->get_input_node_shared_ptr(idx);
                if (ov::as_type_ptr<v0::Constant>(input))
                    continue;
                if (supported_conv_eltwise_post_ops_for_fuse(node)) {
                    return is_suitable_parent(input);
                } else if (supported_conv_act_post_ops_for_fuse(node)) {
                    return is_suitable_parent(input);
                }
                return false;
            }
            return false;
        };
        // reshape is supported in only one case: two consecutive input dims are merged into one output dim
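        // e.g. [1,3,4,6] -> [1,3,24] (dims 2 and 3 merged) is accepted,
        // while [2,4,10,10] -> [2,2,20] is rejected (more than one output dim changes)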
        auto is_suitable_reshape = [](const std::shared_ptr<ov::Node>& node) -> bool {
            if (node->is_dynamic())
                return false;
            auto& in_ps = node->get_input_partial_shape(0);
            auto& out_ps = node->get_output_partial_shape(0);
            if (in_ps.size() - out_ps.size() != 1)
                return false;
            size_t mismatch_count = 0;
            for (size_t i = 0; i < out_ps.size(); ++i) {
                if (i + mismatch_count >= in_ps.size())
                    return false;
                if (out_ps[i] != in_ps[i + mismatch_count]) {
                    mismatch_count++;
                }
            }
            return mismatch_count == 1;
        };
        const auto reshape = ov::as_type_ptr<v1::Reshape>(output.get_node_shared_ptr());
        return is_suitable_reshape(reshape) && is_suitable_parent(reshape->get_input_node_shared_ptr(0));
    };

    auto reshape_m = wrap_type<v1::Reshape>(reshape_predicate);
    auto transpose_const_m = wrap_type<v0::Constant>();
    auto transpose_m = wrap_type<v1::Transpose>({reshape_m, transpose_const_m});

    ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reshape = std::dynamic_pointer_cast<v1::Reshape>(pattern_map.at(reshape_m).get_node_shared_ptr());
        if (!reshape || transformation_callback(reshape)) {
            return false;
        }

        auto update_order = [](std::vector<uint16_t> original_order, const std::shared_ptr<v1::Reshape>& reshape_node) {
            // Example: in this sequence the Reshape node merges 2 consecutive dims into one,
            // so the order must be updated as if the permute were done before the reshape:
            // [1,3,4,6] -> Reshape [1,3,24] -> permute(0,2,1) -> [1,24,3]
            // the updated order must be (0,2,3,1):
            // the dim with index=2 is split into 2 parts: 2 and 3
            auto reshape_in_shape = reshape_node->get_input_partial_shape(0).to_shape();
            auto reshape_out_shape = reshape_node->get_output_partial_shape(0).to_shape();
            auto transformed_order = original_order;
            const uint16_t merge_dim_idx = [&]() {
                for (uint16_t i = 0; i < reshape_out_shape.size(); ++i) {
                    if (reshape_in_shape[i] != reshape_out_shape[i])
                        return i;
                }
                OPENVINO_THROW("same input/output for reshape node");
            }();
            auto insert_it = transformed_order.end();
            for (auto it = transformed_order.begin(); it != transformed_order.end(); ++it) {
                auto& elem = *it;
                if (elem > merge_dim_idx) {
                    elem++;
                } else if (elem == merge_dim_idx) {
                    insert_it = it + 1;
                }
            }
            transformed_order.insert(insert_it, merge_dim_idx + 1);
            return transformed_order;
        };

        // allow transposes that rotate the feature dim to the back, making it the innermost axis
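        // i.e. the order must be (0, 2, 3, ..., n-1, 1), e.g. (0, 2, 1) for rank 3 or (0, 2, 3, 1) for rank 4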
        auto check_transpose_order = [](std::vector<uint16_t>& order) -> bool {
            if (order.size() <= 2)
                return false;
            if ((int32_t)order[order.size() - 2] != (int32_t)order.size() - 1)
                return false;
            if ((int32_t)order[0] != 0)
                return false;
            for (int32_t i = 2; i < (int32_t)order.size(); ++i) {
                if ((int32_t)order[i - 1] != i)
                    return false;
            }
            return true;
        };

        auto transpose = std::dynamic_pointer_cast<v1::Transpose>(pattern_map.at(transpose_m).get_node_shared_ptr());
        if (pattern_map.count(transpose_const_m) > 0) {
            auto org_transpose_const = pattern_map.at(transpose_const_m).get_node_shared_ptr();
            auto org_transpose_os = transpose->get_output_shape(0);
            auto transpose_order = std::dynamic_pointer_cast<v0::Constant>(org_transpose_const);
            auto updated_order = update_order(transpose_order->cast_vector<uint16_t>(), reshape);
            if (check_transpose_order(updated_order)) {
                auto updated_transpose_order = std::make_shared<v0::Constant>(transpose_order->get_element_type(),
                                                                              ov::Shape(1, updated_order.size()),
                                                                              updated_order);
                updated_transpose_order->set_friendly_name(transpose_order->get_friendly_name() + "_updated");
                auto new_transpose =
                    std::make_shared<v1::Transpose>(reshape->input(0).get_source_output(), updated_transpose_order);
                new_transpose->set_friendly_name(transpose->get_friendly_name() + "_with_updated_order");
                copy_runtime_info(transpose, new_transpose);
                ov::replace_node(reshape, new_transpose);
                auto new_pattern_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32,
                                                                                ov::Shape{org_transpose_os.size()},
                                                                                org_transpose_os);
                auto new_reshape = std::make_shared<ov::op::v1::Reshape>(new_transpose,
                                                                         new_pattern_const,
                                                                         reshape->get_special_zero());
                new_reshape->set_friendly_name(reshape->get_friendly_name() + "_sinked_after_transpose");
                copy_runtime_info(reshape, new_reshape);
                ov::replace_node(transpose, new_reshape);
            }
        }
        return true;
    };
    auto m = std::make_shared<ov::pass::pattern::Matcher>(transpose_m, "SinkReshapeIfNeeded");
    this->register_matcher(m, callback);
}
} // namespace intel_gpu
} // namespace ov
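
For intuition, the order-update math above can be exercised in isolation. Below is a minimal standalone sketch (a hypothetical re-implementation, not part of the commit) of the `update_order` logic under the same merge-one-dim assumption:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Given a permute order on the reshaped (rank-1-smaller) tensor and the index
// of the merged dim, produce the equivalent order on the unreshaped tensor.
std::vector<uint16_t> update_order(std::vector<uint16_t> order, uint16_t merge_dim_idx) {
    auto insert_it = order.end();
    for (auto it = order.begin(); it != order.end(); ++it) {
        if (*it > merge_dim_idx) {
            ++*it;               // dims after the merged one shift right by 1
        } else if (*it == merge_dim_idx) {
            insert_it = it + 1;  // the merged dim splits into (idx, idx + 1)
        }
    }
    order.insert(insert_it, merge_dim_idx + 1);
    return order;
}

int main() {
    // [1,3,4,6] -> Reshape [1,3,24] -> Transpose(0,2,1): the merged dim index
    // is 2, so the equivalent pre-reshape order is (0,2,3,1).
    for (auto d : update_order({0, 2, 1}, 2))
        std::cout << d << ' ';   // prints: 0 2 3 1
    std::cout << '\n';
}
```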
19 changes: 19 additions & 0 deletions src/plugins/intel_gpu/src/plugin/transformations/sink_reshape.hpp
@@ -0,0 +1,19 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "openvino/pass/graph_rewrite.hpp"

namespace ov {
namespace intel_gpu {

class SinkReshape : public ov::pass::MatcherPass {
public:
    OPENVINO_MATCHER_PASS_RTTI("SinkReshapeIfNeeded");
    SinkReshape();
};

} // namespace intel_gpu
} // namespace ov
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -86,6 +86,7 @@
#include "plugin/transformations/dynamic_quantize_fully_connected.hpp"
#include "plugin/transformations/optimize_subsequent_reshapes.hpp"
#include "plugin/transformations/lora_horizontal_fusion.hpp"
#include "plugin/transformations/sink_reshape.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"
#include "transformations/common_optimizations/rms_fusion.hpp"
#include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp"
@@ -1097,6 +1098,8 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
    // This Validate is needed for proper data type propagation after applying IncreasePositionIdsPrecision pass
    manager.register_pass<ov::pass::Validate>();

    manager.register_pass<ov::intel_gpu::SinkReshape>();

    if (device_info.supports_immad) {
        auto dynamic_quantization_group_size = config.get_property(ov::hint::dynamic_quantization_group_size);
        pass_config->set_callback<ov::intel_gpu::DynamicQuantizeFullyConnected>([=](const_node_ptr& root) -> bool {
125 changes: 125 additions & 0 deletions src/plugins/intel_gpu/tests/unit/transformations/sink_reshape_test.cpp
@@ -0,0 +1,125 @@
// Copyright (C) 2018-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#include <gtest/gtest.h>

#include <string>
#include <memory>

#include <openvino/core/model.hpp>
#include <openvino/opsets/opset1.hpp>
#include "openvino/op/softmax.hpp"
#include <transformations/init_node_info.hpp>
#include <transformations/utils/utils.hpp>
#include "plugin/transformations/sink_reshape.hpp"
#include "plugin/transformations/convert_convolution.hpp"
#include "common_test_utils/ov_test_utils.hpp"

using namespace testing;
using namespace ov::intel_gpu;

using SinkReshapeParams = std::tuple<bool,   // add eltwise
                                     bool,   // add activation
                                     bool,   // eligible rotation
                                     bool>;  // eligible reshape

class SinkReshapeTests : public TransformationTestsF, public WithParamInterface<SinkReshapeParams> {
public:
    static std::string get_test_case_name(testing::TestParamInfo<SinkReshapeParams> obj) {
        bool add_eltwise;
        bool add_activation;
        bool eligible_rotation;
        bool eligible_reshape;
        std::tie(add_eltwise, add_activation, eligible_rotation, eligible_reshape) = obj.param;

        std::ostringstream result;
        result << "add_eltwise=" << add_eltwise << "_add_activation=" << add_activation
               << "_eligible_rotation=" << eligible_rotation << "_eligible_reshape=" << eligible_reshape;
        return result.str();
    }

    static std::shared_ptr<ov::Model> init_model(const bool add_eltwise,
                                                 const bool add_activation,
                                                 const bool eligible_rotation,
                                                 const bool eligible_reshape,
                                                 const bool ref) {
        ov::Strides strides{1, 1};
        ov::Strides dilations{1, 1};
        ov::CoordinateDiff pads_begin{0, 0};
        ov::CoordinateDiff pads_end{0, 0};
        auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{2, 3, 12, 12});
        auto weights_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{4, 3, 3, 3}, {1});
        auto conv = std::make_shared<ov::op::v1::Convolution>(input,
                                                              weights_const,
                                                              strides,
                                                              pads_begin,
                                                              pads_end,
                                                              dilations,
                                                              ov::op::PadType::EXPLICIT);
        std::shared_ptr<ov::Node> reshape_input_node = conv;
        if (add_eltwise) {
            auto sub_const = ov::opset1::Constant::create(ov::element::f32, ov::Shape{1}, {1});
            reshape_input_node = std::make_shared<ov::opset1::Subtract>(reshape_input_node, sub_const);
        }

        if (add_activation) {
            reshape_input_node = std::make_shared<ov::opset1::Sigmoid>(reshape_input_node);
        }
        std::shared_ptr<ov::Model> model = nullptr;
        if (!ref) {
            auto shape = eligible_reshape ? std::vector<int>{2, 4, 100} : std::vector<int>{2, 2, 20};
            auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {3}, shape);
            auto reshape = std::make_shared<ov::opset1::Reshape>(reshape_input_node, reshape_const, true);
            auto order = eligible_rotation ? std::vector<int>{0, 2, 1} : std::vector<int>{2, 1, 0};
            auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {3}, order);
            auto transpose = std::make_shared<ov::opset1::Transpose>(reshape, transpose_const);

            auto softmax = std::make_shared<ov::op::v8::Softmax>(transpose);
            model = std::make_shared<ov::Model>(ov::NodeVector{softmax}, ov::ParameterVector{input});
        } else {
            auto transpose_const = ov::opset1::Constant::create(ov::element::i32, {4}, {0, 2, 3, 1});
            auto transpose = std::make_shared<ov::opset1::Transpose>(reshape_input_node, transpose_const);
            auto reshape_const = ov::opset1::Constant::create(ov::element::i32, {3}, {2, 100, 4});
            auto reshape = std::make_shared<ov::opset1::Reshape>(transpose, reshape_const, true);
            auto softmax = std::make_shared<ov::op::v8::Softmax>(reshape);
            model = std::make_shared<ov::Model>(ov::NodeVector{softmax}, ov::ParameterVector{input});
        }
        ov::pass::Manager manager;
        manager.register_pass<ConvertConvolutionToInternal>();
        if (!ref)
            manager.register_pass<SinkReshape>();
        manager.run_passes(model);
        return model;
    }

protected:
    void SetUp() override {
        TransformationTestsF::SetUp();
        bool add_eltwise;
        bool add_activation;
        bool eligible_rotation;
        bool eligible_reshape;
        std::tie(add_eltwise, add_activation, eligible_rotation, eligible_reshape) = this->GetParam();

        model = init_model(add_eltwise, add_activation, eligible_rotation, eligible_reshape, true);
        if (!eligible_rotation || !eligible_reshape)
            model_ref = model->clone();
        else
            model_ref = init_model(add_eltwise, add_activation, eligible_rotation, eligible_reshape, false);
    }
};
};

TEST_P(SinkReshapeTests, CompareFunctions) {}

const std::vector<bool> add_eltwise = {false, true};
const std::vector<bool> add_activation = {false, true};
const std::vector<bool> eligible_rotation = {false, true};
const std::vector<bool> eligible_reshape = {false, true};

INSTANTIATE_TEST_SUITE_P(smoke_TransformationTests_reshape_transpose,
                         SinkReshapeTests,
                         ::testing::Combine(::testing::ValuesIn(add_eltwise),
                                            ::testing::ValuesIn(add_activation),
                                            ::testing::ValuesIn(eligible_rotation),
                                            ::testing::ValuesIn(eligible_reshape)),
                         SinkReshapeTests::get_test_case_name);
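
These cases should be runnable with the GPU plugin's unit-test binary (assuming the usual `ov_gpu_unit_tests` target) via `--gtest_filter=smoke_TransformationTests_reshape_transpose*`.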
