@@ -75,7 +75,7 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
             OPENVINO_THROW("NPUW: TEMPORARY LIMITATION: Couldn't create reference infer "
                            "requests if 'nireq' is set to > 1!");
         }
-        LOG_INFO("Create reference subrequest for submodel [" << id << "] on " << m_npuw_model->m_ref_device << "...");
+        LOG_INFO("Create reference subrequest for Subgraph [" << id << "] on " << m_npuw_model->m_ref_device << "...");
         LOG_BLOCK();
         if (m_npuw_model->submodel_device(id) != m_npuw_model->m_ref_device) {
            auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(id).ref_compiled_model;
@@ -85,60 +85,129 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
             m_ref_subrequests.at(id) = std::move(ref_infer_request);
             LOG_INFO("Done");
         } else {
-            LOG_INFO("Skip creation of reference subrequest for submodule ["
-                     << id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
-                     << id << "] has been already created on "
-                     << "it.");
+            LOG_INFO("Skip creation of reference subrequest for Subgraph ["
+                     << id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
+                     << id << "] has been already created on "
+                     << "it.");
         }
     }
 
     return rqs;
 }
 
-void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& failover) {
+// The index <-> real_idx mapping matters here
+void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) {
+    if (!m_npuw_model->m_acc_check) {
+        return;
+    }
     LOG_INFO("Check if subrequest[" << idx << "] is accurate...");
     LOG_BLOCK();
-    failover = false;
-    if (m_ref_subrequests.at(idx) != nullptr && m_subrequests.at(idx)._ptr != m_ref_subrequests.at(idx)._ptr) {
-        NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref == false);
-        NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).replaced_by.value_or(idx) == idx);
 
-        const auto& ref_comp_model = m_ref_subrequests.at(idx)->get_compiled_model();
-        const auto& actual_comp_model = m_subrequests.at(idx)->get_compiled_model();
+    if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) {
+        LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device.");
+        return;
+    }
+
+    accuracy_failover = false;
+
+    std::size_t real_idx = real(idx);
+    auto& actual_subr = m_subrequests.at(real_idx);
+    auto& ref_subr = m_ref_subrequests.at(real_idx);
+    NPUW_ASSERT(ref_subr);
+
+    // This check is omitted in the new PR. What does it mean?
+    // Check for already switched request is omitted -> that is okay.
+    if (actual_subr._ptr != ref_subr._ptr) {
+        NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(real_idx).switched_to_ref == false);
+
+        const auto& ref_comp_model = ref_subr->get_compiled_model();
+        const auto& actual_comp_model = actual_subr->get_compiled_model();
         NPUW_ASSERT(actual_comp_model->inputs().size() == ref_comp_model->inputs().size());
         // Setting inputs:
         for (size_t i = 0; i < actual_comp_model->inputs().size(); i++) {
-            const auto& itensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->inputs()[i]);
-            m_ref_subrequests.at(idx)->set_tensor(ref_comp_model->inputs()[i], itensor);
+            const auto& itensor = actual_subr->get_tensor(actual_comp_model->inputs()[i]);
+            ref_subr->set_tensor(ref_comp_model->inputs()[i], itensor);
         }
-        m_ref_subrequests.at(idx)->infer();
+        // Running inference:
+        ref_subr->infer();
 
+        // Comparing results of actual and reference inferences:
         LOG_INFO("Compare actual outputs against references:");
         bool tensors_converge = true;
+        std::vector<bool> converges(actual_comp_model->outputs().size());
+        std::vector<double> metrics(actual_comp_model->outputs().size());
         for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
-            LOG_INFO("- " << actual_comp_model->outputs()[i]);
-            const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-            const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
-            LOG_BLOCK();
-            tensors_converge &= m_npuw_model->m_acc_check(actual_tensor, ref_tensor);
+            const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
+            const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
+            converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]);
+            tensors_converge &= converges[i];
+        }
+        if (tensors_converge == false) {
+            if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) {
+                // When the log level is just ERROR, print the header message here:
+                LOG_ERROR("Check if subrequest[" << idx << "] is accurate...");
+            }
+        }
+        // Log comparison details:
+        for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+            if (converges[i]) {
+                LOG_INFO("- " << actual_comp_model->outputs()[i]);
+                LOG_BLOCK();
+                LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                         ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+                LOG_INFO("PASS");
+            } else {
+                LOG_ERROR("- " << actual_comp_model->outputs()[i]);
+                LOG_BLOCK();
+                LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                          ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+                LOG_ERROR("FAIL");
+            }
         }
-        LOG_INFO((tensors_converge ? "PASS" : "FAIL"));
 
-        if (!tensors_converge) {
-            LOG_INFO("Subrequest is inaccurate, failover to reference.");
-            // FIXME: We need to copy reference tensors to actual only in single-model-inference mode
-            // or if our subgraph is last in the chain.
+        // If comparison fails, failover to reference results:
+        if (tensors_converge) {
+            LOG_INFO("PASS");
+        } else {
+            LOG_ERROR("FAIL");
+            LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results.");
+            if (idx != real_idx) {
+                LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx <<
+                          "], all subrequests, corresponding to the latter, will be further " <<
+                          "launched on " << m_npuw_model->m_ref_device << ".");
+            } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) {
+                LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " <<
+                          "subrequests, corresponding to it, will be further launched on " <<
+                          m_npuw_model->m_ref_device << ".");
+            }
+
+            if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) {
+                const auto model = m_npuw_model->m_compiled_submodels[real_idx].model;
+                const auto model_path = "inaccurate_" + model->get_friendly_name() + ".xml";
+                ov::save_model(model, model_path);
+                dump_input_tensors(idx, true);
+                dump_output_tensors(idx, true);
+            }
+
+            // TODO: For future implementation of accuracy failover in spatial mode, it will
+            // be safe to just copy results from reference subrequest back to already
+            // properly allocated and linked tensors of actual subrequest due to the
+            // complex memory management logic.
+
+            // FIXME: Now, we need to copy reference tensors to actual only in
+            // single-model-inference mode or if our subgraph is the last in the chain.
             for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
-                const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-                const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
+                const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
+                const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
                 ref_tensor->copy_to(actual_tensor._ptr);
             }
-            m_npuw_model->m_compiled_submodels.at(idx).compiled_model =
-                m_npuw_model->m_compiled_submodels.at(idx).ref_compiled_model;
-            m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref = true;
-            m_subrequests.at(idx) = m_ref_subrequests.at(idx);
+            m_npuw_model->m_compiled_submodels.at(real_idx).compiled_model =
+                m_npuw_model->m_compiled_submodels.at(real_idx).ref_compiled_model;
+            m_subrequests.at(real_idx) = m_ref_subrequests.at(real_idx);
+            // Using idx here, let real_idx be handled inside if needed.
             update_subrequest_links(idx);
-            failover = true;
+            m_npuw_model->m_compiled_submodels.at(real_idx).switched_to_ref = true;
+            accuracy_failover = true;
         }
 
         LOG_INFO("Done");
@@ -226,9 +295,9 @@ void ov::npuw::IBaseInferRequest::infer() {
     m_run_iter++;
 
     if (failover_happened) {
-        LOG_INFO("Refined device distribution:");
+        LOG_ERROR("Refined device distribution:");
         LOG_BLOCK();
-        m_npuw_model->log_device_dist();
+        m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error);
     }
     m_now_idx.reset();
 }
@@ -488,12 +557,11 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req
     LOG_DEBUG("Done");
 }
 
-void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
+void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx, bool forced) {
     const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
     const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
-    auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
-
-    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) {
+    const std::size_t real_idx = real(idx);
+    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) {
         return;
     }
 
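For reference, the `real(idx)` helper that replaces the inline lookup here is not shown in this hunk. Under the assumption that it simply resolves a subgraph index to the submodel that owns the compiled body, i.e. the same `replaced_by.value_or(idx)` expression the deleted line used, a self-contained sketch could look as follows; `resolve_real_index` and the vector-of-optionals layout are illustrative only.

    #include <cstddef>
    #include <optional>
    #include <vector>

    // Assumed behaviour of real(idx): a subgraph folded into a shared function body
    // records the owning submodel index in replaced_by, and value_or(idx) keeps the
    // index itself when no replacement was recorded -- the same expression the
    // deleted line above used inline.
    std::size_t resolve_real_index(std::size_t idx,
                                   const std::vector<std::optional<std::size_t>>& replaced_by) {
        return replaced_by.at(idx).value_or(idx);
    }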
@@ -569,12 +637,12 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
     }
 }
 
-void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
+void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx, bool forced) {
     const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
     const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
     auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
 
-    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) {
+    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) {
         return;
     }
 
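The new `forced` flag lets the accuracy-failover path dump the failing subgraph's I/O even when no dump option is set by the user. As a rough usage sketch of how this feature would typically be triggered from application code: the property keys below are assumptions inferred from the config identifiers referenced in the diff (e.g. `NPUW_ACC_DUMP_FAILS`) and the NPUW accuracy-check options, and "model.xml" is a placeholder; verify the exact keys against your OpenVINO build before relying on them.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder path

        // Hypothetical NPUW configuration: enable per-subgraph accuracy checks
        // against a reference device and dump artifacts of inaccurate subgraphs.
        ov::AnyMap npuw_cfg = {
            {"NPU_USE_NPUW", "YES"},         // route compilation through NPUW
            {"NPUW_ACC_CHECK", "YES"},       // enable per-subgraph accuracy checks
            {"NPUW_ACC_DEVICE", "CPU"},      // reference device for the checks
            {"NPUW_ACC_DUMP_FAILS", "YES"},  // dump model + I/O of inaccurate subgraphs
        };
        auto compiled = core.compile_model(model, "NPU", npuw_cfg);
        auto request = compiled.create_infer_request();
        request.infer();  // accuracy failover, if any, is reported at ERROR log level
        return 0;
    }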