Skip to content

Commit 80e753c

Browse files
committed
cpu: pooling: modify acl_pooling for stateless functions
Change-Id: I30a987c8c56e1b0a64e3b2268cc96ec30b2abce4
1 parent bbf8399 commit 80e753c

File tree

2 files changed

+112
-102
lines changed

2 files changed

+112
-102
lines changed

src/cpu/aarch64/acl_pooling.cpp

Lines changed: 57 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2022-2023 Arm Ltd. and affiliates
2+
* Copyright 2022-2023, 2025 Arm Ltd. and affiliates
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -21,35 +21,72 @@ namespace impl {
2121
namespace cpu {
2222
namespace aarch64 {
2323

24+
status_t acl_pooling_fwd_t::init(engine_t *engine) {
    // Create the stateless ACL pooling operator and configure it from the
    // Compute Library descriptors cached on the primitive descriptor.
    // Bind by const reference: asp_ holds several TensorInfo objects and
    // copying it here buys nothing.
    const auto &asp = pd()->asp_;

    pooling_op_ = std::make_unique<arm_compute::experimental::op::CpuPooling>();

    if (asp.use_ws) {
        // Workspace variant: the operator also emits the max-pool index
        // tensor (forward training).
        pooling_op_->configure(
                &asp.src_info, &asp.dst_info, asp.pool_info, &asp.ws_info);
    } else {
        // No workspace tensor is produced (inference / average pooling).
        pooling_op_->configure(
                &asp.src_info, &asp.dst_info, asp.pool_info, nullptr);
    }

    return status::success;
}
42+
2443
status_t acl_pooling_fwd_t::execute_forward(const exec_ctx_t &ctx) const {
    // Run the stateless CpuPooling operator. Every ACL tensor below is a
    // zero-copy wrapper: import_memory() only acquires the pointer, it does
    // not allocate or take ownership.
    auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
    auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);

    // Const reference: avoid copying the conf (multiple TensorInfo members)
    // on every execution.
    const auto &asp = pd()->asp_;

    arm_compute::Tensor src_tensor;
    arm_compute::Tensor dst_tensor;
    src_tensor.allocator()->init(asp.src_info);
    src_tensor.allocator()->import_memory(const_cast<void *>(src));
    dst_tensor.allocator()->init(asp.dst_info);
    dst_tensor.allocator()->import_memory(dst);

    // f32 temporary shaped like dst, backed by the primitive scratchpad;
    // handed to the operator as its internal (ACL_INT_0) buffer.
    arm_compute::Tensor scratch_tensor;
    void *scratchpad_base = ctx.get_scratchpad_grantor().get<void>(
            memory_tracking::names::key_pool_reduction);
    scratch_tensor.allocator()->init(arm_compute::TensorInfo(
            asp.dst_info.tensor_shape(), 1, arm_compute::DataType::F32));
    scratch_tensor.allocator()->import_memory(scratchpad_base);

    arm_compute::Tensor ws_tensor;
    if (asp.use_ws) {
        // Workspace (max-pool indices) exists only in forward training;
        // keep the raw pointer scoped to the branch that uses it.
        void *ws_base = CTX_OUT_MEM(void *, DNNL_ARG_WORKSPACE);
        ws_tensor.allocator()->init(asp.ws_info);
        ws_tensor.allocator()->import_memory(ws_base);
    }

    arm_compute::ITensorPack run_pack {
            {arm_compute::TensorType::ACL_SRC_0, &src_tensor},
            {arm_compute::TensorType::ACL_DST_0, &dst_tensor},
            {arm_compute::TensorType::ACL_INT_0, &scratch_tensor}};
    if (asp.use_ws)
        run_pack.add_tensor(arm_compute::TensorType::ACL_DST_1, &ws_tensor);

    pooling_op_->run(run_pack);

    return status::success;
}
5491

5592
} // namespace aarch64

src/cpu/aarch64/acl_pooling.hpp

Lines changed: 55 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2022-2023 Arm Ltd. and affiliates
2+
* Copyright 2022-2023, 2025 Arm Ltd. and affiliates
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -17,72 +17,39 @@
1717
#ifndef CPU_AARCH64_ACL_POOLING_HPP
1818
#define CPU_AARCH64_ACL_POOLING_HPP
1919

20-
#include "cpu/aarch64/acl_utils.hpp"
2120
#include "cpu/cpu_pooling_pd.hpp"
2221

22+
#include "cpu/aarch64/acl_utils.hpp"
23+
24+
#include "arm_compute/core/TensorInfo.h"
25+
#include "arm_compute/runtime/IOperator.h"
26+
#include "arm_compute/runtime/experimental/operators/CpuPooling.h"
27+
2328
namespace dnnl {
2429
namespace impl {
2530
namespace cpu {
2631
namespace aarch64 {
2732

28-
struct acl_pooling_obj_t {
29-
arm_compute::NEPoolingLayer pool;
30-
arm_compute::Tensor src_tensor;
31-
arm_compute::Tensor ws_tensor;
32-
arm_compute::Tensor dst_tensor;
33-
bool use_ws;
34-
};
35-
3633
struct acl_pooling_conf_t {
37-
arm_compute::PoolingLayerInfo pool_info;
3834
arm_compute::TensorInfo src_info;
39-
arm_compute::TensorInfo ws_info;
4035
arm_compute::TensorInfo dst_info;
36+
arm_compute::PoolingLayerInfo pool_info;
37+
arm_compute::TensorInfo ws_info;
4138
bool use_ws;
4239
};
4340

44-
struct acl_pooling_resource_t : public resource_t {
45-
acl_pooling_resource_t()
46-
: acl_pooling_obj_(utils::make_unique<acl_pooling_obj_t>()) {}
47-
48-
status_t configure(const acl_pooling_conf_t &app) {
49-
if (!acl_pooling_obj_) return status::out_of_memory;
50-
51-
// Init Compute Library tensors based on info from descriptor
52-
acl_pooling_obj_->src_tensor.allocator()->init(app.src_info);
53-
acl_pooling_obj_->dst_tensor.allocator()->init(app.dst_info);
54-
55-
if (app.use_ws) {
56-
acl_pooling_obj_->ws_tensor.allocator()->init(app.ws_info);
57-
acl_pooling_obj_->pool.configure(&acl_pooling_obj_->src_tensor,
58-
&acl_pooling_obj_->dst_tensor, app.pool_info,
59-
&acl_pooling_obj_->ws_tensor);
60-
acl_pooling_obj_->use_ws = true;
61-
} else {
62-
acl_pooling_obj_->pool.configure(&acl_pooling_obj_->src_tensor,
63-
&acl_pooling_obj_->dst_tensor, app.pool_info);
64-
}
65-
66-
return status::success;
67-
}
68-
69-
acl_pooling_obj_t &get_acl_obj() const { return *acl_pooling_obj_; }
70-
71-
DNNL_DISALLOW_COPY_AND_ASSIGN(acl_pooling_resource_t);
72-
73-
private:
74-
std::unique_ptr<acl_pooling_obj_t> acl_pooling_obj_;
75-
}; // acl_pooling_resource_t
76-
7741
struct acl_pooling_fwd_t : public primitive_t {
7842
struct pd_t : public cpu_pooling_fwd_pd_t {
7943
using cpu_pooling_fwd_pd_t::cpu_pooling_fwd_pd_t;
80-
81-
DECLARE_COMMON_PD_T("acl", acl_pooling_fwd_t);
44+
DECLARE_COMMON_PD_T("acl", acl_pooling_fwd_t, USE_GLOBAL_SCRATCHPAD);
8245

8346
status_t init(engine_t *engine) {
47+
auto scratchpad = scratchpad_registry().registrar();
48+
CHECK(init_scratchpad(scratchpad));
49+
50+
// ACL supports forward propagation only
8451
bool ok = set_default_params() == status::success
85-
&& is_fwd() // ACL supports forward propagation only
52+
&& is_fwd()
8653
&& utils::everyone_is(
8754
src_md()->data_type, dst_md()->data_type)
8855
&& utils::one_of(
@@ -97,21 +64,22 @@ struct acl_pooling_fwd_t : public primitive_t {
9764
// Choose the pooling type
9865
const alg_kind_t alg = pod->alg_kind;
9966
const bool is_max_pool = (alg == alg_kind::pooling_max);
100-
app.pool_info.pool_type = is_max_pool
67+
asp_.pool_info.pool_type = is_max_pool
10168
? arm_compute::PoolingType::MAX
10269
: arm_compute::PoolingType::AVG;
10370

10471
// Check if workspace Tensor is needed
10572
const bool ws_init = (is_max_pool
10673
&& pod->prop_kind == prop_kind::forward_training);
107-
app.use_ws = ws_init;
74+
asp_.use_ws = ws_init;
10875

10976
ACL_CHECK_SUPPORT(ws_init && src_md()->data_type != data_type::f32,
11077
"ACL Max pooling forward training only supports f32");
11178

11279
if (ws_init)
80+
// ACL only supports U32/S32 no U8
11381
init_default_ws(
114-
data_type::s32); // ACL only supports U32/S32 no U8
82+
data_type::s32);
11583
auto src_tag = memory_desc_matches_one_of_tag(
11684
*src_md(), format_tag::nhwc, format_tag::nchw);
11785
auto dst_tag = memory_desc_matches_one_of_tag(
@@ -129,12 +97,12 @@ struct acl_pooling_fwd_t : public primitive_t {
12997
ACL_CHECK_SUPPORT(ndims != 4, "Tensor is not 4d");
13098

13199
// Pooling window
132-
app.pool_info.pool_size = arm_compute::Size2D(KW(), KH());
100+
asp_.pool_info.pool_size = arm_compute::Size2D(KW(), KH());
133101
// Choose the data layout
134102
bool is_nhwc = src_tag == format_tag::nhwc;
135103
const auto acl_layout = is_nhwc ? arm_compute::DataLayout::NHWC
136104
: arm_compute::DataLayout::NCHW;
137-
app.pool_info.data_layout = acl_layout;
105+
asp_.pool_info.data_layout = acl_layout;
138106
const auto acl_data_t
139107
= acl_utils::get_acl_data_t(src_d.data_type());
140108

@@ -158,41 +126,43 @@ struct acl_pooling_fwd_t : public primitive_t {
158126
"kernels are faster for this problem");
159127
}
160128

161-
app.pool_info.exclude_padding
129+
asp_.pool_info.exclude_padding
162130
= (alg == alg_kind::pooling_avg_exclude_padding);
163131

164-
app.pool_info.pad_stride_info = arm_compute::PadStrideInfo(KSW(),
132+
asp_.pool_info.pad_stride_info = arm_compute::PadStrideInfo(KSW(),
165133
KSH(), padL(), padR(), padT(), padB(),
166134
arm_compute::DimensionRoundingType::FLOOR);
167135

168-
app.src_info = arm_compute::TensorInfo(is_nhwc
136+
asp_.src_info = arm_compute::TensorInfo(is_nhwc
169137
? arm_compute::TensorShape(IC(), IW(), IH(), MB())
170138
: arm_compute::TensorShape(IW(), IH(), IC(), MB()),
171139
1, acl_data_t, acl_layout);
172-
app.dst_info = arm_compute::TensorInfo(is_nhwc
140+
asp_.dst_info = arm_compute::TensorInfo(is_nhwc
173141
? arm_compute::TensorShape(OC(), OW(), OH(), MB())
174142
: arm_compute::TensorShape(OW(), OH(), OC(), MB()),
175143
1, acl_data_t, acl_layout);
176144

177145
// Use datatype lowest property instead of using -INF
178-
app.pool_info.use_inf_as_limit = false;
146+
asp_.pool_info.use_inf_as_limit = false;
179147

180148
if (ws_init) {
181-
app.ws_info = arm_compute::TensorInfo(is_nhwc
149+
asp_.ws_info = arm_compute::TensorInfo(is_nhwc
182150
? arm_compute::TensorShape(
183151
OC(), OW(), OH(), MB())
184152
: arm_compute::TensorShape(
185153
OW(), OH(), OC(), MB()),
186154
1, arm_compute::DataType::U32, acl_layout);
187155

188156
// Return kernel indices instead of source indices.
189-
app.pool_info.use_kernel_indices = true;
157+
asp_.pool_info.use_kernel_indices = true;
190158
ACL_CHECK_VALID(
191-
arm_compute::NEPoolingLayer::validate(&app.src_info,
192-
&app.dst_info, app.pool_info, &app.ws_info));
159+
arm_compute::experimental::op::CpuPooling::validate(
160+
&asp_.src_info, &asp_.dst_info, asp_.pool_info, &asp_.ws_info));
193161
} else {
194-
ACL_CHECK_VALID(arm_compute::NEPoolingLayer::validate(
195-
&app.src_info, &app.dst_info, app.pool_info));
162+
asp_.pool_info.use_kernel_indices = false;
163+
ACL_CHECK_VALID(
164+
arm_compute::experimental::op::CpuPooling::validate(
165+
&asp_.src_info, &asp_.dst_info, asp_.pool_info));
196166
}
197167

198168
return status::success;
@@ -262,34 +232,37 @@ struct acl_pooling_fwd_t : public primitive_t {
262232
return problem_size > cutoff * thread_count;
263233
}
264234

265-
acl_pooling_conf_t app = utils::zero<decltype(app)>();
266-
};
235+
acl_pooling_conf_t asp_;
236+
237+
// Book per-execution scratchpad storage used by execute_forward().
status_t init_scratchpad(memory_tracking::registrar_t &scratchpad) {
    const memory_desc_wrapper dst_d(&dst_md_);

    // f32 reduction buffer shaped like dst, imported into the operator's
    // internal ACL tensor at execution time.
    scratchpad.book(memory_tracking::names::key_pool_reduction,
            dst_d.nelems(), sizeof(float));

    // NOTE(review): pd_t::init() calls init_scratchpad() before asp_.use_ws
    // is assigned, so gating this booking on asp_.use_ws read an unassigned
    // flag. Book the u32 index buffer unconditionally instead: over-booking
    // scratchpad is harmless, under-booking would be a buffer overrun.
    // (This buffer is not referenced in the visible execute_forward —
    // presumably kept for index-layout conversion; confirm before removing.)
    scratchpad.book(memory_tracking::names::key_pool_ind_plain2blocked_cvt,
            dst_d.nelems(), sizeof(uint32_t));

    return status::success;
}
251+
252+
}; // pd_t
267253

254+
// constructor
268255
acl_pooling_fwd_t(const pd_t *apd) : primitive_t(apd) {}
269256

270257
status_t execute(const exec_ctx_t &ctx) const override {
271258
return execute_forward(ctx);
272259
}
273260

274-
status_t create_resource(
275-
engine_t *engine, resource_mapper_t &mapper) const override {
276-
if (mapper.has_resource(this)) return status::success;
277-
278-
auto r = utils::make_unique<acl_pooling_resource_t>();
279-
if (!r) return status::out_of_memory;
280-
281-
// Configure the resource based on information from primitive descriptor
282-
auto st = r->configure(pd()->app);
283-
if (st == status::success) { mapper.add(this, std::move(r)); }
284-
285-
return st;
286-
}
287-
288261
private:
289-
// execute_forward has to be const thus mutability of mtx
290-
mutable std::mutex mtx;
262+
status_t init(engine_t *engine) override;
291263
status_t execute_forward(const exec_ctx_t &ctx) const;
292264
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
265+
std::unique_ptr<arm_compute::experimental::op::CpuPooling> pooling_op_;
293266
}; // acl_pooling_fwd_t
294267

295268
} // namespace aarch64

0 commit comments

Comments
 (0)