Skip to content

Commit 1b18cb0

Browse files
committed
Picked fixes to current version of Accuracy Failover from PR openvinotoolkit#27348
1 parent 3f6e736 commit 1b18cb0

File tree

10 files changed

+214
-85
lines changed

10 files changed

+214
-85
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime);
6363
DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime);
6464
DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime);
6565
DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime);
66+
DEFINE_OPT(NPUW_ACC_DUMP_FAILS, bool, false, npuw::accuracy::dump_failures, RunTime);
6667
DEFINE_OPT(NPUW_DUMP_FULL, bool, false, npuw::dump::full, CompileTime);
6768
DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime);
6869
DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,14 @@ static constexpr ov::Property<double> threshold{"NPUW_ACC_THRESH"};
331331
* Default value: empty.
332332
*/
333333
static constexpr ov::Property<std::string> reference_device{"NPUW_ACC_DEVICE"};
334+
335+
/**
336+
* @brief
337+
* Type: bool.
338+
* Enable dumps of materials for model(s) failing the accuracy check.
339+
* Default value: false.
340+
*/
341+
static constexpr ov::Property<bool> dump_failures{"NPUW_ACC_DUMP_FAILS"};
334342
} // namespace accuracy
335343

336344
namespace dump {

src/plugins/intel_npu/src/al/src/config/npuw.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
4747
desc.add<NPUW_ACC_CHECK>();
4848
desc.add<NPUW_ACC_THRESH>();
4949
desc.add<NPUW_ACC_DEVICE>();
50+
desc.add<NPUW_ACC_DUMP_FAILS>();
5051
#ifdef NPU_PLUGIN_DEVELOPER_BUILD
5152
desc.add<NPUW_DUMP_FULL>();
5253
desc.add<NPUW_DUMP_SUBS>();

src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp

Lines changed: 44 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,47 @@
1313
ov::npuw::metrics::NRMSE::NRMSE(double threshold) : m_threshold(threshold) {}
1414

1515
bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
16-
const ov::SoPtr<ov::ITensor>& reference) const {
17-
NPUW_ASSERT(actual->is_continuous());
18-
NPUW_ASSERT(reference->is_continuous());
16+
const ov::SoPtr<ov::ITensor>& reference,
17+
double* result) const {
1918
NPUW_ASSERT(actual->get_shape() == reference->get_shape());
2019
// Check for alignment:
2120
NPUW_ASSERT(actual->get_byte_size() == reference->get_byte_size());
22-
// FIXME: Check for strides
21+
22+
ov::Tensor in_actual(actual->get_element_type(), actual->get_shape());
23+
ov::Tensor in_reference(reference->get_element_type(), reference->get_shape());
24+
25+
if (!actual->is_continuous()) {
26+
ov::make_tensor(actual).copy_to(in_actual);
27+
} else {
28+
in_actual = ov::make_tensor(actual);
29+
}
30+
if (!reference->is_continuous()) {
31+
ov::make_tensor(reference).copy_to(in_reference);
32+
} else {
33+
in_reference = ov::make_tensor(reference);
34+
}
35+
36+
// TODO: it might be more correct to make to_f32 function
37+
// to work with strided tensors
38+
NPUW_ASSERT(in_actual.is_continuous());
39+
NPUW_ASSERT(in_reference.is_continuous());
2340

2441
ov::Tensor actual_f32;
2542
ov::Tensor reference_f32;
2643

27-
if (ov::element::Type_t::f32 == actual->get_element_type()) {
28-
actual_f32 = ov::make_tensor(actual);
44+
if (ov::element::f32 == in_actual.get_element_type()) {
45+
actual_f32 = in_actual;
2946
} else {
30-
ov::Tensor dst(ov::element::Type_t::f32, actual->get_shape());
31-
ov::npuw::util::to_f32(ov::make_tensor(actual), dst);
47+
ov::Tensor dst(ov::element::Type_t::f32, in_actual.get_shape());
48+
ov::npuw::util::to_f32(in_actual, dst);
3249
actual_f32 = std::move(dst);
3350
}
3451

35-
if (ov::element::Type_t::f32 == reference->get_element_type()) {
36-
reference_f32 = ov::make_tensor(reference);
52+
if (ov::element::f32 == in_reference.get_element_type()) {
53+
reference_f32 = in_reference;
3754
} else {
38-
ov::Tensor dst(ov::element::Type_t::f32, reference->get_shape());
39-
ov::npuw::util::to_f32(ov::make_tensor(reference), dst);
55+
ov::Tensor dst(ov::element::Type_t::f32, in_reference.get_shape());
56+
ov::npuw::util::to_f32(in_reference, dst);
4057
reference_f32 = dst;
4158
}
4259

@@ -51,13 +68,21 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
5168
}
5269

5370
if (squared_error <= std::numeric_limits<double>::epsilon()) {
54-
LOG_INFO("NRMSE loss: 0.0, threshold: " << m_threshold << ".");
55-
LOG_INFO("PASS");
71+
if (result != nullptr) {
72+
*result = 0.0;
73+
}
5674
return true;
5775
}
5876

5977
double rmse = sqrt(squared_error / size);
60-
NPUW_ASSERT(rmse >= 0.0);
78+
79+
if (rmse < 0.0) {
80+
// Calculated RMSE metric is < 0.0, which is unexpected. So, report that the tensors are unequal.
81+
if (result != nullptr) {
82+
*result = rmse;
83+
}
84+
return false;
85+
}
6186

6287
auto actual_min_max = std::minmax_element(actual_data, actual_data + size);
6388
auto reference_min_max = std::minmax_element(reference_data, reference_data + size);
@@ -66,9 +91,8 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
6691
std::max(0.f, *actual_min_max.second) - std::min(0.f, *actual_min_max.first)});
6792

6893
double nrmse = rmse / den;
69-
LOG_INFO("NRMSE loss: " << nrmse << ", threshold: " << m_threshold << ".");
70-
71-
bool success = nrmse <= m_threshold;
72-
LOG_INFO((success ? "PASS" : "FAIL"));
73-
return success;
94+
if (result != nullptr) {
95+
*result = nrmse;
96+
}
97+
return nrmse <= m_threshold;
7498
}

src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ namespace metrics {
1515
class NRMSE {
1616
public:
1717
explicit NRMSE(double threshold);
18-
bool operator()(const ov::SoPtr<ov::ITensor>& backup_tensor, const ov::SoPtr<ov::ITensor>& original_tensor) const;
19-
18+
bool operator()(const ov::SoPtr<ov::ITensor>& backup_tensor,
19+
const ov::SoPtr<ov::ITensor>& original_tensor,
20+
double* result = nullptr) const;
2021
private:
2122
double m_threshold{};
2223
};

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp

Lines changed: 108 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
7575
OPENVINO_THROW("NPUW: TEMPORARY LIMITATION: Couldn't create reference infer "
7676
"requests if 'nireq' is set to > 1!");
7777
}
78-
LOG_INFO("Create reference subrequest for submodel [" << id << "] on " << m_npuw_model->m_ref_device << "...");
78+
LOG_INFO("Create reference subrequest for Subgraph[" << id << "] on " << m_npuw_model->m_ref_device << "...");
7979
LOG_BLOCK();
8080
if (m_npuw_model->submodel_device(id) != m_npuw_model->m_ref_device) {
8181
auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(id).ref_compiled_model;
@@ -85,60 +85,129 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
8585
m_ref_subrequests.at(id) = std::move(ref_infer_request);
8686
LOG_INFO("Done");
8787
} else {
88-
LOG_INFO("Skip creation of reference subrequest for submodule["
89-
<< id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
90-
<< id << "] has been already created on "
91-
<< "it .");
88+
LOG_INFO("Skip creation of reference subrequest for Subgraph["
89+
<< id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
90+
<< id << "] has been already created on "
91+
<< "it .");
9292
}
9393
}
9494

9595
return rqs;
9696
}
9797

98-
void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& failover) {
98+
// index <-> real_idx matter here
99+
void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) {
100+
if (!m_npuw_model->m_acc_check) {
101+
return;
102+
}
99103
LOG_INFO("Check if subrequest[" << idx << "] is accurate...");
100104
LOG_BLOCK();
101-
failover = false;
102-
if (m_ref_subrequests.at(idx) != nullptr && m_subrequests.at(idx)._ptr != m_ref_subrequests.at(idx)._ptr) {
103-
NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref == false);
104-
NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).replaced_by.value_or(idx) == idx);
105105

106-
const auto& ref_comp_model = m_ref_subrequests.at(idx)->get_compiled_model();
107-
const auto& actual_comp_model = m_subrequests.at(idx)->get_compiled_model();
106+
if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) {
107+
LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device.");
108+
return;
109+
}
110+
111+
accuracy_failover = false;
112+
113+
std::size_t real_idx = real(idx);
114+
auto& actual_subr = m_subrequests.at(real_idx);
115+
auto& ref_subr = m_ref_subrequests.at(real_idx);
116+
NPUW_ASSERT(ref_subr);
117+
118+
// This check is omitted in the new PR. What does it mean?
119+
// Check for already switched request is omitted -> that is okay.
120+
if (actual_subr._ptr != ref_subr._ptr) {
121+
NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(real_idx).switched_to_ref == false);
122+
123+
const auto& ref_comp_model = ref_subr->get_compiled_model();
124+
const auto& actual_comp_model = actual_subr->get_compiled_model();
108125
NPUW_ASSERT(actual_comp_model->inputs().size() == ref_comp_model->inputs().size());
109126
// Setting inputs:
110127
for (size_t i = 0; i < actual_comp_model->inputs().size(); i++) {
111-
const auto& itensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->inputs()[i]);
112-
m_ref_subrequests.at(idx)->set_tensor(ref_comp_model->inputs()[i], itensor);
128+
const auto& itensor = actual_subr->get_tensor(actual_comp_model->inputs()[i]);
129+
ref_subr->set_tensor(ref_comp_model->inputs()[i], itensor);
113130
}
114-
m_ref_subrequests.at(idx)->infer();
131+
// Running inference:
132+
ref_subr->infer();
115133

134+
// Comparing results of actual and reference inferences:
116135
LOG_INFO("Compare actual outputs against references:");
117136
bool tensors_converge = true;
137+
std::vector<bool> converges(actual_comp_model->outputs().size());
138+
std::vector<double> metrics(actual_comp_model->outputs().size());
118139
for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
119-
LOG_INFO(" - " << actual_comp_model->outputs()[i]);
120-
const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
121-
const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
122-
LOG_BLOCK();
123-
tensors_converge &= m_npuw_model->m_acc_check(actual_tensor, ref_tensor);
140+
const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
141+
const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
142+
converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]);
143+
tensors_converge &= converges[i];
144+
}
145+
if (tensors_converge == false) {
146+
if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) {
147+
// For just log level error print header message:
148+
LOG_ERROR("Check if subrequest[" << idx << "] is accurate...");
149+
}
150+
}
151+
// Log comparison details:
152+
for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
153+
if (converges[i]) {
154+
LOG_INFO(" - " << actual_comp_model->outputs()[i]);
155+
LOG_BLOCK();
156+
LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
157+
", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
158+
LOG_INFO("PASS");
159+
} else {
160+
LOG_ERROR(" - " << actual_comp_model->outputs()[i]);
161+
LOG_BLOCK();
162+
LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
163+
", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
164+
LOG_ERROR("FAIL");
165+
}
124166
}
125-
LOG_INFO((tensors_converge ? "PASS" : "FAIL"));
126167

127-
if (!tensors_converge) {
128-
LOG_INFO("Subrequest is inaccurate, failover to reference.");
129-
// FIXME: We need to copy reference tensors to actual only in single-model-inference mode
130-
// or if our subgraph is last in the chain.
168+
// If comparison fails, failover to reference results:
169+
if (tensors_converge) {
170+
LOG_INFO("PASS");
171+
} else {
172+
LOG_ERROR("FAIL");
173+
LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results.");
174+
if (idx != real_idx) {
175+
LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx <<
176+
"], all subrequests, corresponding to last, will be further " <<
177+
"launched on " << m_npuw_model->m_ref_device << ".'");
178+
} else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) {
179+
LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " <<
180+
"subrequests, corresponding to it, will be further launched on " <<
181+
m_npuw_model->m_ref_device << ".");
182+
}
183+
184+
if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) {
185+
const auto model = m_npuw_model->m_compiled_submodels[real_idx].model;
186+
const auto model_path = "inaccurate_" + model->get_friendly_name() + ".xml";
187+
ov::save_model(model, model_path);
188+
dump_input_tensors(idx, true);
189+
dump_output_tensors(idx, true);
190+
}
191+
192+
// TODO: For future implementation of accuracy failover in spatial mode, it will
193+
// be safe to just copy results from reference subrequest back to already
194+
// properly allocated and linked tensors of actual subrequest due to the
195+
// complex memory management logic.
196+
197+
// FIXME: Now, we need to copy reference tensors to actual only in
198+
// single-model-inference mode or if our subgraph is the last in the chain.
131199
for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
132-
const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
133-
const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
200+
const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
201+
const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
134202
ref_tensor->copy_to(actual_tensor._ptr);
135203
}
136-
m_npuw_model->m_compiled_submodels.at(idx).compiled_model =
137-
m_npuw_model->m_compiled_submodels.at(idx).ref_compiled_model;
138-
m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref = true;
139-
m_subrequests.at(idx) = m_ref_subrequests.at(idx);
204+
m_npuw_model->m_compiled_submodels.at(real_idx).compiled_model =
205+
m_npuw_model->m_compiled_submodels.at(real_idx).ref_compiled_model;
206+
m_subrequests.at(real_idx) = m_ref_subrequests.at(real_idx);
207+
// Using idx here, let real_idx be handled inside if needed.
140208
update_subrequest_links(idx);
141-
failover = true;
209+
m_npuw_model->m_compiled_submodels.at(real_idx).switched_to_ref = true;
210+
accuracy_failover = true;
142211
}
143212

144213
LOG_INFO("Done");
@@ -226,9 +295,9 @@ void ov::npuw::IBaseInferRequest::infer() {
226295
m_run_iter++;
227296

228297
if (failover_happened) {
229-
LOG_INFO("Refined device distribution:");
298+
LOG_ERROR("Refined device distribution:");
230299
LOG_BLOCK();
231-
m_npuw_model->log_device_dist();
300+
m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error);
232301
}
233302
m_now_idx.reset();
234303
}
@@ -488,12 +557,11 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req
488557
LOG_DEBUG("Done");
489558
}
490559

491-
void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
560+
void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx, bool forced) {
492561
const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
493562
const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
494-
auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
495-
496-
if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) {
563+
const std::size_t real_idx = real(idx);
564+
if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) {
497565
return;
498566
}
499567

@@ -569,12 +637,12 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
569637
}
570638
}
571639

572-
void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
640+
void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx, bool forced) {
573641
const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
574642
const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
575643
auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
576644

577-
if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) {
645+
if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) {
578646
return;
579647
}
580648

src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,8 @@ class IBaseInferRequest : public ov::ISyncInferRequest {
150150
virtual void bind_global_params(std::size_t idx, RqPtr request);
151151
virtual void bind_global_results(std::size_t idx, RqPtr request);
152152

153-
void dump_input_tensors(std::size_t idx);
154-
void dump_output_tensors(std::size_t idx);
153+
void dump_input_tensors(std::size_t idx, bool forced = false);
154+
void dump_output_tensors(std::size_t idx, bool forced = false);
155155

156156
// Quick-and-dirty profiling
157157
ov::npuw::perf::metric<float, ov::npuw::perf::MSec> m_ms_unpack;

0 commit comments

Comments
 (0)