1
1
/* ******************************************************************************
2
- * Copyright 2022-2023 Arm Ltd. and affiliates
2
+ * Copyright 2022-2023, 2025 Arm Ltd. and affiliates
3
3
*
4
4
* Licensed under the Apache License, Version 2.0 (the "License");
5
5
* you may not use this file except in compliance with the License.
17
17
#ifndef CPU_AARCH64_ACL_POOLING_HPP
18
18
#define CPU_AARCH64_ACL_POOLING_HPP
19
19
20
- #include " cpu/aarch64/acl_utils.hpp"
21
20
#include " cpu/cpu_pooling_pd.hpp"
22
21
22
+ #include " cpu/aarch64/acl_utils.hpp"
23
+
24
+ #include " arm_compute/core/TensorInfo.h"
25
+ #include " arm_compute/runtime/IOperator.h"
26
+ #include " arm_compute/runtime/experimental/operators/CpuPooling.h"
27
+
23
28
namespace dnnl {
24
29
namespace impl {
25
30
namespace cpu {
26
31
namespace aarch64 {
27
32
28
- struct acl_pooling_obj_t {
29
- arm_compute::NEPoolingLayer pool;
30
- arm_compute::Tensor src_tensor;
31
- arm_compute::Tensor ws_tensor;
32
- arm_compute::Tensor dst_tensor;
33
- bool use_ws;
34
- };
35
-
36
33
struct acl_pooling_conf_t {
37
- arm_compute::PoolingLayerInfo pool_info;
38
34
arm_compute::TensorInfo src_info;
39
- arm_compute::TensorInfo ws_info;
40
35
arm_compute::TensorInfo dst_info;
36
+ arm_compute::PoolingLayerInfo pool_info;
37
+ arm_compute::TensorInfo ws_info;
41
38
bool use_ws;
42
39
};
43
40
44
- struct acl_pooling_resource_t : public resource_t {
45
- acl_pooling_resource_t ()
46
- : acl_pooling_obj_(utils::make_unique<acl_pooling_obj_t >()) {}
47
-
48
- status_t configure (const acl_pooling_conf_t &app) {
49
- if (!acl_pooling_obj_) return status::out_of_memory;
50
-
51
- // Init Compute Library tensors based on info from descriptor
52
- acl_pooling_obj_->src_tensor .allocator ()->init (app.src_info );
53
- acl_pooling_obj_->dst_tensor .allocator ()->init (app.dst_info );
54
-
55
- if (app.use_ws ) {
56
- acl_pooling_obj_->ws_tensor .allocator ()->init (app.ws_info );
57
- acl_pooling_obj_->pool .configure (&acl_pooling_obj_->src_tensor ,
58
- &acl_pooling_obj_->dst_tensor , app.pool_info ,
59
- &acl_pooling_obj_->ws_tensor );
60
- acl_pooling_obj_->use_ws = true ;
61
- } else {
62
- acl_pooling_obj_->pool .configure (&acl_pooling_obj_->src_tensor ,
63
- &acl_pooling_obj_->dst_tensor , app.pool_info );
64
- }
65
-
66
- return status::success;
67
- }
68
-
69
- acl_pooling_obj_t &get_acl_obj () const { return *acl_pooling_obj_; }
70
-
71
- DNNL_DISALLOW_COPY_AND_ASSIGN (acl_pooling_resource_t );
72
-
73
- private:
74
- std::unique_ptr<acl_pooling_obj_t > acl_pooling_obj_;
75
- }; // acl_pooling_resource_t
76
-
77
41
struct acl_pooling_fwd_t : public primitive_t {
78
42
struct pd_t : public cpu_pooling_fwd_pd_t {
79
43
using cpu_pooling_fwd_pd_t ::cpu_pooling_fwd_pd_t ;
80
-
81
- DECLARE_COMMON_PD_T (" acl" , acl_pooling_fwd_t );
44
+ DECLARE_COMMON_PD_T (" acl" , acl_pooling_fwd_t , USE_GLOBAL_SCRATCHPAD);
82
45
83
46
status_t init (engine_t *engine) {
47
+ auto scratchpad = scratchpad_registry ().registrar ();
48
+ CHECK (init_scratchpad (scratchpad));
49
+
50
+ // ACL supports forward propagation only
84
51
bool ok = set_default_params () == status::success
85
- && is_fwd () // ACL supports forward propagation only
52
+ && is_fwd ()
86
53
&& utils::everyone_is (
87
54
src_md ()->data_type , dst_md ()->data_type )
88
55
&& utils::one_of (
@@ -97,21 +64,22 @@ struct acl_pooling_fwd_t : public primitive_t {
97
64
// Choose the pooling type
98
65
const alg_kind_t alg = pod->alg_kind ;
99
66
const bool is_max_pool = (alg == alg_kind::pooling_max);
100
- app .pool_info .pool_type = is_max_pool
67
+ asp_ .pool_info .pool_type = is_max_pool
101
68
? arm_compute::PoolingType::MAX
102
69
: arm_compute::PoolingType::AVG;
103
70
104
71
// Check if workspace Tensor is needed
105
72
const bool ws_init = (is_max_pool
106
73
&& pod->prop_kind == prop_kind::forward_training);
107
- app .use_ws = ws_init;
74
+ asp_ .use_ws = ws_init;
108
75
109
76
ACL_CHECK_SUPPORT (ws_init && src_md ()->data_type != data_type::f32 ,
110
77
" ACL Max pooling forward training only supports f32" );
111
78
112
79
if (ws_init)
80
+ // ACL only supports U32/S32 no U8
113
81
init_default_ws (
114
- data_type::s32); // ACL only supports U32/S32 no U8
82
+ data_type::s32);
115
83
auto src_tag = memory_desc_matches_one_of_tag (
116
84
*src_md (), format_tag::nhwc, format_tag::nchw);
117
85
auto dst_tag = memory_desc_matches_one_of_tag (
@@ -129,12 +97,12 @@ struct acl_pooling_fwd_t : public primitive_t {
129
97
ACL_CHECK_SUPPORT (ndims != 4 , " Tensor is not 4d" );
130
98
131
99
// Pooling window
132
- app .pool_info .pool_size = arm_compute::Size2D (KW (), KH ());
100
+ asp_ .pool_info .pool_size = arm_compute::Size2D (KW (), KH ());
133
101
// Choose the data layout
134
102
bool is_nhwc = src_tag == format_tag::nhwc;
135
103
const auto acl_layout = is_nhwc ? arm_compute::DataLayout::NHWC
136
104
: arm_compute::DataLayout::NCHW;
137
- app .pool_info .data_layout = acl_layout;
105
+ asp_ .pool_info .data_layout = acl_layout;
138
106
const auto acl_data_t
139
107
= acl_utils::get_acl_data_t (src_d.data_type ());
140
108
@@ -158,41 +126,43 @@ struct acl_pooling_fwd_t : public primitive_t {
158
126
" kernels are faster for this problem" );
159
127
}
160
128
161
- app .pool_info .exclude_padding
129
+ asp_ .pool_info .exclude_padding
162
130
= (alg == alg_kind::pooling_avg_exclude_padding);
163
131
164
- app .pool_info .pad_stride_info = arm_compute::PadStrideInfo (KSW (),
132
+ asp_ .pool_info .pad_stride_info = arm_compute::PadStrideInfo (KSW (),
165
133
KSH (), padL (), padR (), padT (), padB (),
166
134
arm_compute::DimensionRoundingType::FLOOR);
167
135
168
- app .src_info = arm_compute::TensorInfo (is_nhwc
136
+ asp_ .src_info = arm_compute::TensorInfo (is_nhwc
169
137
? arm_compute::TensorShape (IC (), IW (), IH (), MB ())
170
138
: arm_compute::TensorShape (IW (), IH (), IC (), MB ()),
171
139
1 , acl_data_t , acl_layout);
172
- app .dst_info = arm_compute::TensorInfo (is_nhwc
140
+ asp_ .dst_info = arm_compute::TensorInfo (is_nhwc
173
141
? arm_compute::TensorShape (OC (), OW (), OH (), MB ())
174
142
: arm_compute::TensorShape (OW (), OH (), OC (), MB ()),
175
143
1 , acl_data_t , acl_layout);
176
144
177
145
// Use datatype lowest property instead of using -INF
178
- app .pool_info .use_inf_as_limit = false ;
146
+ asp_ .pool_info .use_inf_as_limit = false ;
179
147
180
148
if (ws_init) {
181
- app .ws_info = arm_compute::TensorInfo (is_nhwc
149
+ asp_ .ws_info = arm_compute::TensorInfo (is_nhwc
182
150
? arm_compute::TensorShape (
183
151
OC (), OW (), OH (), MB ())
184
152
: arm_compute::TensorShape (
185
153
OW (), OH (), OC (), MB ()),
186
154
1 , arm_compute::DataType::U32, acl_layout);
187
155
188
156
// Return kernel indices instead of source indices.
189
- app .pool_info .use_kernel_indices = true ;
157
+ asp_ .pool_info .use_kernel_indices = true ;
190
158
ACL_CHECK_VALID (
191
- arm_compute::NEPoolingLayer:: validate (&app. src_info ,
192
- &app .dst_info , app .pool_info , &app .ws_info ));
159
+ arm_compute::experimental::op::CpuPooling:: validate (
160
+ &asp_. src_info , &asp_ .dst_info , asp_ .pool_info , &asp_ .ws_info ));
193
161
} else {
194
- ACL_CHECK_VALID (arm_compute::NEPoolingLayer::validate (
195
- &app.src_info , &app.dst_info , app.pool_info ));
162
+ asp_.pool_info .use_kernel_indices = false ;
163
+ ACL_CHECK_VALID (
164
+ arm_compute::experimental::op::CpuPooling::validate (
165
+ &asp_.src_info , &asp_.dst_info , asp_.pool_info ));
196
166
}
197
167
198
168
return status::success;
@@ -262,34 +232,37 @@ struct acl_pooling_fwd_t : public primitive_t {
262
232
return problem_size > cutoff * thread_count;
263
233
}
264
234
265
- acl_pooling_conf_t app = utils::zero<decltype (app)>();
266
- };
235
+ acl_pooling_conf_t asp_;
236
+
237
+ status_t init_scratchpad (
238
+ memory_tracking::registrar_t &scratchpad) {
239
+ const memory_desc_wrapper dst_d (&dst_md_);
240
+ scratchpad.book (
241
+ memory_tracking::names::key_pool_reduction,
242
+ dst_d.nelems (), sizeof (float )
243
+ );
244
+ if (asp_.use_ws ) {
245
+ scratchpad.book (
246
+ memory_tracking::names::key_pool_ind_plain2blocked_cvt,
247
+ dst_d.nelems (), sizeof (uint32_t ));
248
+ }
249
+ return status::success;
250
+ }
251
+
252
+ }; // pd_t
267
253
254
+ // constructor
268
255
acl_pooling_fwd_t (const pd_t *apd) : primitive_t (apd) {}
269
256
270
257
// Entry point used to run the primitive: forwards the execution context to
// execute_forward() and returns its status unchanged.
status_t execute(const exec_ctx_t &ctx) const override {
    return execute_forward(ctx);
}
273
260
274
- status_t create_resource (
275
- engine_t *engine, resource_mapper_t &mapper) const override {
276
- if (mapper.has_resource (this )) return status::success;
277
-
278
- auto r = utils::make_unique<acl_pooling_resource_t >();
279
- if (!r) return status::out_of_memory;
280
-
281
- // Configure the resource based on information from primitive descriptor
282
- auto st = r->configure (pd ()->app );
283
- if (st == status::success) { mapper.add (this , std::move (r)); }
284
-
285
- return st;
286
- }
287
-
288
261
private:
289
- // execute_forward has to be const thus mutability of mtx
290
- mutable std::mutex mtx;
262
+ status_t init (engine_t *engine) override ;
291
263
status_t execute_forward (const exec_ctx_t &ctx) const ;
292
264
// Returns the primitive descriptor held by the primitive_t base, viewed as
// this implementation's pd_t. A named static_cast is used instead of a
// C-style cast so the compiler rejects the conversion if the two types ever
// stop being related, rather than silently reinterpreting the pointer.
const pd_t *pd() const {
    return static_cast<const pd_t *>(primitive_t::pd().get());
}
265
+ std::unique_ptr<arm_compute::experimental::op::CpuPooling> pooling_op_;
293
266
}; // acl_pooling_fwd_t
294
267
295
268
} // namespace aarch64
0 commit comments