diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 82dfaceb12ddd4..05a70a9c0c8796 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -98,6 +98,7 @@ DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime); DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime); DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime); DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime); +DEFINE_OPT(NPUW_ACC_DUMP_FAILS, bool, false, npuw::accuracy::dump_failures, RunTime); DEFINE_OPT(NPUW_DUMP_FULL, bool, false, npuw::dump::full, RunTime); DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, RunTime); DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index afdd28a46c5ddc..12354701eed725 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -331,6 +331,14 @@ static constexpr ov::Property threshold{"NPUW_ACC_THRESH"}; * Default value: empty. */ static constexpr ov::Property reference_device{"NPUW_ACC_DEVICE"}; + +/** + * @brief + * Type: bool. + * Enable dumps of materials for model(s), failing accuracy check. + * Default value: false. + */ +static constexpr ov::Property dump_failures{"NPUW_ACC_DUMP_FAILS"}; } // namespace accuracy namespace dump { diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 76a28acc8016d4..edb666bda427a7 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -47,6 +47,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); #ifdef NPU_PLUGIN_DEVELOPER_BUILD desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp index 4440027c818969..13294ac521f122 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp @@ -13,30 +13,47 @@ ov::npuw::metrics::NRMSE::NRMSE(double threshold) : m_threshold(threshold) {} bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr& actual, - const ov::SoPtr& reference) const { - NPUW_ASSERT(actual->is_continuous()); - NPUW_ASSERT(reference->is_continuous()); + const ov::SoPtr& reference, + double* result) const { NPUW_ASSERT(actual->get_shape() == reference->get_shape()); // Check for alignment: NPUW_ASSERT(actual->get_byte_size() == reference->get_byte_size()); - // FIXME: Check for strides + + ov::Tensor in_actual(actual->get_element_type(), actual->get_shape()); + ov::Tensor in_reference(reference->get_element_type(), reference->get_shape()); + + if (!actual->is_continuous()) { + ov::make_tensor(actual).copy_to(in_actual); + } else { + in_actual = ov::make_tensor(actual); + } + if (!reference->is_continuous()) { + ov::make_tensor(reference).copy_to(in_reference); + } else { + in_reference = ov::make_tensor(reference); + } + + // TODO: it might be more correct to make 
the to_f32 function
+    // work with strided tensors
+    NPUW_ASSERT(in_actual.is_continuous());
+    NPUW_ASSERT(in_reference.is_continuous());
 
     ov::Tensor actual_f32;
     ov::Tensor reference_f32;
 
-    if (ov::element::Type_t::f32 == actual->get_element_type()) {
-        actual_f32 = ov::make_tensor(actual);
+    if (ov::element::f32 == in_actual.get_element_type()) {
+        actual_f32 = in_actual;
     } else {
-        ov::Tensor dst(ov::element::Type_t::f32, actual->get_shape());
-        ov::npuw::util::to_f32(ov::make_tensor(actual), dst);
+        ov::Tensor dst(ov::element::Type_t::f32, in_actual.get_shape());
+        ov::npuw::util::to_f32(in_actual, dst);
         actual_f32 = std::move(dst);
     }
-    if (ov::element::Type_t::f32 == reference->get_element_type()) {
-        reference_f32 = ov::make_tensor(reference);
+    if (ov::element::f32 == in_reference.get_element_type()) {
+        reference_f32 = in_reference;
     } else {
-        ov::Tensor dst(ov::element::Type_t::f32, reference->get_shape());
-        ov::npuw::util::to_f32(ov::make_tensor(reference), dst);
+        ov::Tensor dst(ov::element::Type_t::f32, in_reference.get_shape());
+        ov::npuw::util::to_f32(in_reference, dst);
         reference_f32 = dst;
     }
 
@@ -51,13 +68,21 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
     }
 
     if (squared_error <= std::numeric_limits<double>::epsilon()) {
-        LOG_INFO("NRMSE loss: 0.0, threshold: " << m_threshold << ".");
-        LOG_INFO("PASS");
+        if (result != nullptr) {
+            *result = 0.0;
+        }
         return true;
     }
 
     double rmse = sqrt(squared_error / size);
-    NPUW_ASSERT(rmse >= 0.0);
+
+    if (rmse < 0.0) {
+        // The calculated RMSE is < 0.0, which is unexpected, so report the tensors as not equal.
+        if (result != nullptr) {
+            *result = rmse;
+        }
+        return false;
+    }
 
     auto actual_min_max = std::minmax_element(actual_data, actual_data + size);
     auto reference_min_max = std::minmax_element(reference_data, reference_data + size);
@@ -66,9 +91,8 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
                                      std::max(0.f, *actual_min_max.second) - std::min(0.f, *actual_min_max.first)});
 
     double nrmse = rmse / den;
-    LOG_INFO("NRMSE loss: " << nrmse << ", threshold: " << m_threshold << ".");
-
-    bool success = nrmse <= m_threshold;
-    LOG_INFO((success ? 
"PASS" : "FAIL")); - return success; + if (result != nullptr) { + *result = nrmse; + } + return nrmse <= m_threshold; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp index e77a38ced0edc2..1d0182582946c3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp @@ -15,8 +15,9 @@ namespace metrics { class NRMSE { public: explicit NRMSE(double threshold); - bool operator()(const ov::SoPtr& backup_tensor, const ov::SoPtr& original_tensor) const; - + bool operator()(const ov::SoPtr& backup_tensor, + const ov::SoPtr& original_tensor, + double* result = nullptr) const; private: double m_threshold{}; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index c6fc0d6dd563c4..a6fb0cae4d8436 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -75,7 +75,7 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re OPENVINO_THROW("NPUW: TEMPORARY LIMITATION: Couldn't create reference infer " "requests if 'nireq' is set to > 1!"); } - LOG_INFO("Create reference subrequest for submodel [" << id << "] on " << m_npuw_model->m_ref_device << "..."); + LOG_INFO("Create reference subrequest for Subgraph[" << id << "] on " << m_npuw_model->m_ref_device << "..."); LOG_BLOCK(); if (m_npuw_model->submodel_device(id) != m_npuw_model->m_ref_device) { auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(id).ref_compiled_model; @@ -94,56 +94,215 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re return rqs; } -void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& failover) { +namespace { + void set_inputs(const ov::SoPtr& from, ov::SoPtr& to) { + const auto& from_comp_model = from->get_compiled_model(); + const auto& to_comp_model = to->get_compiled_model(); + for (size_t i = 0; i < from_comp_model->inputs().size(); i++) { + const auto& itnsr = from->get_tensor(from_comp_model->inputs()[i]); + to->set_tensor(to_comp_model->inputs()[i], itnsr); + } + } + + void copy_results(const ov::SoPtr& from, ov::SoPtr& to) { + const auto& from_comp_model = from->get_compiled_model(); + const auto& to_comp_model = to->get_compiled_model(); + for (size_t i = 0; i < to_comp_model->outputs().size(); i++) { + const auto& from_tnsr = from->get_tensor(from_comp_model->outputs()[i]); + const auto& to_tnsr = to->get_tensor(to_comp_model->outputs()[i]); + from_tnsr->copy_to(to_tnsr._ptr); + } + } + + std::stringstream create_launch_msg(std::size_t idx, std::size_t real_idx) { + std::stringstream log_msg_stream; + log_msg_stream << "Launching subrequest[" << idx << "]" << + ((real_idx == idx) ? 
std::string("...").c_str() : + std::string(std::string(", which is actually subrequest[") + + std::to_string(real_idx) + "]").c_str()); + return log_msg_stream; + } +} // anonymous namespace + +void ov::npuw::IBaseInferRequest::try_accurate_subinfer(std::size_t subidx, std::size_t offset, + std::size_t len, bool& accuracy_failover) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->infer(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, real_subidx); + if (m_npuw_model->m_compiled_submodels[real_subidx].spatial && len != 0) { + log_msg_stream << ", on range : [" << offset << ", " << offset + len << ")"; + } + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->infer(); + copy_results(ref_subr, act_subr); + } else { + act_subr->infer(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::IBaseInferRequest::try_accurate_subinfer(std::size_t subidx, bool& accuracy_failover) { + try_accurate_subinfer(subidx, 0, 0, accuracy_failover); +} + +void ov::npuw::IBaseInferRequest::try_accurate_substart_async(std::size_t subidx) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->start_async(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, real_subidx); + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->start_async(); + } else { + act_subr->start_async(); + } +} + +void ov::npuw::IBaseInferRequest::try_accurate_subwait(std::size_t subidx, bool& accuracy_failover) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->wait(); + return; + } + + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + ref_subr->wait(); + copy_results(ref_subr, act_subr); + } else { + act_subr->wait(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) { + if (!m_npuw_model->m_acc_check) { + return; + } + LOG_INFO("Check if subrequest[" << idx << "] is accurate..."); LOG_BLOCK(); - failover = false; - if (m_ref_subrequests.at(idx) != nullptr && m_subrequests.at(idx)._ptr != m_ref_subrequests.at(idx)._ptr) { - NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref == false); - NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).replaced_by.value_or(idx) == idx); - - const auto& ref_comp_model = m_ref_subrequests.at(idx)->get_compiled_model(); - const auto& actual_comp_model = m_subrequests.at(idx)->get_compiled_model(); - 
NPUW_ASSERT(actual_comp_model->inputs().size() == ref_comp_model->inputs().size());
-        // Setting inputs:
-        for (size_t i = 0; i < actual_comp_model->inputs().size(); i++) {
-            const auto& itensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->inputs()[i]);
-            m_ref_subrequests.at(idx)->set_tensor(ref_comp_model->inputs()[i], itensor);
-        }
-        m_ref_subrequests.at(idx)->infer();
-        LOG_INFO("Compare actual outputs against references:");
-        bool tensors_converge = true;
-        for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+    std::size_t real_idx = real(idx);
+    OPENVINO_ASSERT(m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref == false);
+
+    if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) {
+        LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device.");
+        return;
+    }
+
+    accuracy_failover = false;
+    auto& actual_subr = m_subrequests.at(real_idx);
+    auto& ref_subr = m_ref_subrequests.at(real_idx);
+
+    // Setting inputs:
+    set_inputs(actual_subr, ref_subr);
+
+    // Running inference:
+    ref_subr->infer();
+
+    // Comparing results of actual and reference inferences:
+    LOG_INFO("Compare actual outputs against references:");
+    bool tensors_converge = true;
+    const auto& actual_comp_model = actual_subr->get_compiled_model();
+    const auto& ref_comp_model = ref_subr->get_compiled_model();
+    std::vector<bool> converges(actual_comp_model->outputs().size());
+    std::vector<double> metrics(actual_comp_model->outputs().size());
+    for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+        const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
+        const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
+        converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]);
+        tensors_converge &= converges[i];
+    }
+    if (tensors_converge == false) {
+        if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) {
+            // When the log level is limited to Error, print the header message here:
+            LOG_ERROR("Check if subrequest[" << idx << "] is accurate...");
+        }
+    }
+    // Log comparison details:
+    for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+        if (converges[i]) {
             LOG_INFO(" - " << actual_comp_model->outputs()[i]);
-            const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-            const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
             LOG_BLOCK();
-            tensors_converge &= m_npuw_model->m_acc_check(actual_tensor, ref_tensor);
-        }
-        LOG_INFO((tensors_converge ? "PASS" : "FAIL"));
-
-        if (!tensors_converge) {
-            LOG_INFO("Subrequest is inaccurate, failover to reference.");
-            // FIXME: We need to copy reference tensors to actual only in single-model-inference mode
-            // or if our subgraph is last in the chain. 
-            for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
-                const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-                const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
-                ref_tensor->copy_to(actual_tensor._ptr);
-            }
-            m_npuw_model->m_compiled_submodels.at(idx).compiled_model =
-                m_npuw_model->m_compiled_submodels.at(idx).ref_compiled_model;
-            m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref = true;
-            m_subrequests.at(idx) = m_ref_subrequests.at(idx);
-            update_subrequest_links(idx);
-            failover = true;
+            LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                     ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+            LOG_INFO("PASS");
+        } else {
+            LOG_ERROR(" - " << actual_comp_model->outputs()[i]);
+            LOG_BLOCK();
+            LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                      ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+            LOG_ERROR("FAIL");
         }
+    }
-        LOG_INFO("Done");
+
+    // If the comparison failed, copy the reference results to the original tensors and mark the
+    // subgraph as switched to reference:
+    if (tensors_converge) {
+        LOG_INFO("PASS");
     } else {
-        LOG_INFO("Skipped, subrequest is launched on reference device.");
+        LOG_ERROR("FAIL");
+        LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results.");
+        if (idx != real_idx) {
+            LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx <<
+                      "], all subrequests corresponding to the latter will be further " <<
+                      "launched on " << m_npuw_model->m_ref_device << ".");
+        } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) {
+            LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " <<
+                      "subrequests corresponding to it will be further launched on " <<
+                      m_npuw_model->m_ref_device << ".");
+        }
+
+        if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) {
+            const auto model = m_npuw_model->m_compiled_submodels[real_idx].model;
+            const auto model_path = "inaccurate_" + model->get_friendly_name() + ".xml";
+            ov::save_model(model, model_path);
+            dump_input_tensors(idx, true);
+            dump_output_tensors(idx, true);
+        }
+
+        // Due to the complex memory management logic, it is safest to simply copy the
+        // results back into the already allocated and linked tensors:
+        copy_results(ref_subr, actual_subr);
+        m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref = true;
+        accuracy_failover = true;
     }
+
+    LOG_INFO("Done");
 }
 
 ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
@@ -215,19 +374,15 @@ void ov::npuw::IBaseInferRequest::infer() {
         run_subrequest_for_success(idx, failover);
         failover_happened |= failover;
         complete_subrequest(idx);
-        if (m_npuw_model->m_acc_check) {
-            ensure_subrequest_is_accurate(idx, failover);
-            failover_happened |= failover;
-        }
     }
 
     // Increment counter regardless if dumps etc are enabled or not. 
m_run_iter++; if (failover_happened) { - LOG_INFO("Refined device distribution:"); + LOG_ERROR("Refined device distribution:"); LOG_BLOCK(); - m_npuw_model->log_device_dist(); + m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error); } m_now_idx.reset(); } @@ -487,12 +642,11 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req LOG_DEBUG("Done"); } -void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { +void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx, bool forced) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); - auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - - if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) { + const std::size_t real_idx = real(idx); + if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) { return; } @@ -568,12 +722,12 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { } } -void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { +void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx, bool forced) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) { + if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) { return; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 576810cc9b216d..c7002aa8f3b2de 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -74,8 +74,12 @@ class IBaseInferRequest : public ov::ISyncInferRequest { // their inference requests anymore - they must be stored // only once in the subrequests list RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1, bool* recompiled = nullptr); - void ensure_subrequest_is_accurate(std::size_t idx, bool& failover); - virtual void update_subrequest_links(std::size_t idx) = 0; + void try_accurate_subinfer(std::size_t idx, bool& accuracy_failover); + void try_accurate_subinfer(std::size_t idx, std::size_t offset, std::size_t len, + bool& accuracy_failover); + virtual void try_accurate_substart_async(std::size_t idx); + virtual void try_accurate_subwait(std::size_t idx, bool& accuracy_failover); + virtual void ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover); std::shared_ptr m_npuw_model; std::vector m_completion_cbs; @@ -150,8 +154,9 @@ class IBaseInferRequest : public ov::ISyncInferRequest { virtual void bind_global_params(std::size_t idx, RqPtr request); virtual void bind_global_results(std::size_t idx, RqPtr request); - void dump_input_tensors(std::size_t idx); - void dump_output_tensors(std::size_t idx); + + void dump_input_tensors(std::size_t idx, bool forced = false); + void dump_output_tensors(std::size_t idx, bool forced = false); // Quick-and-dirty profiling ov::npuw::perf::metric m_ms_unpack; @@ -172,11 +177,11 @@ class IBaseInferRequest : public ov::ISyncInferRequest { std::size_t next(std::size_t idx_base) const; std::size_t real(std::size_t idx) const; - RqPtrs m_ref_subrequests; - using 
now_t = std::optional; now_t now_idx() const; + RqPtrs m_ref_subrequests; + private: now_t m_now_idx; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 3d8ded4191d042..8fe3f806c29654 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -194,6 +194,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, const double threshold_opt = m_cfg.get<::intel_npu::NPUW_ACC_THRESH>(); m_acc_check = metrics::NRMSE(threshold_opt); + m_acc_check_name = "NRMSE"; + m_acc_check_threshold = threshold_opt; m_ref_device = m_cfg.getString<::intel_npu::NPUW_ACC_DEVICE>(); LOG_INFO("Accuracy check is enabled."); } @@ -443,8 +445,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } } - m_compiled_submodels[id].device_it = - id != real_id ? m_compiled_submodels[real_id].device_it : m_dev_list.cbegin(); + m_compiled_submodels[id].device_it = m_dev_list.cbegin(); if (forced_sub_devices.count(id)) { std::string forced_device = forced_sub_devices[id]; @@ -499,7 +500,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, // Finalize memory in closures and weight banks finalize_weights_bank(); - detach_memory(); + + if (!m_acc_check || (m_acc_check && !m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>())) { + detach_memory(); + } // Print stats report when possible { @@ -1545,7 +1549,7 @@ bool ov::npuw::CompiledModel::is_gather_closure(const std::size_t idx, const std return false; } -void ov::npuw::CompiledModel::log_device_dist() const { +void ov::npuw::CompiledModel::log_device_dist(ov::npuw::LogLevel log_lvl) const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -1560,14 +1564,32 @@ void ov::npuw::CompiledModel::log_device_dist() const { stat.ops += real_cm.stat.ops; } - auto print_stats = [this](const std::string& device, const execution_stats& stat) { + auto print_stats = [this, log_lvl](const std::string& device, const execution_stats& stat) { float flops_prcnt = 100.f; float ops_prcnt = 100.f; if (m_total_stat.gflops > 0 && m_total_stat.ops > 0) { flops_prcnt = stat.gflops / static_cast(m_total_stat.gflops) * 100; ops_prcnt = stat.ops / static_cast(m_total_stat.ops) * 100; } - LOG_INFO(device << ": " << flops_prcnt << "% FLOPS, " << ops_prcnt << "% Layers"); + std::stringstream log_msg; + log_msg << device << ": " << flops_prcnt << "% FLOPS, " << ops_prcnt << "% Layers"; + switch (log_lvl) { + case LogLevel::Error: + LOG_ERROR(log_msg.str()); + break; + case LogLevel::Warning: + LOG_WARN(log_msg.str()); + break; + case LogLevel::Info: + LOG_INFO(log_msg.str()); + break; + case LogLevel::Verbose: + LOG_VERB(log_msg.str()); + break; + case LogLevel::Debug: + LOG_DEBUG(log_msg.str()); + break; + } }; for (auto&& device_st : stats_for_devices) { LOG_BLOCK(); @@ -1712,6 +1734,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::accuracy::check, NPUW_ACC_CHECK), BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH), BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE), + BIND(npuw::accuracy::dump_failures, NPUW_ACC_DUMP_FAILS), #ifdef NPU_PLUGIN_DEVELOPER_BUILD BIND(npuw::dump::full, NPUW_DUMP_FULL), BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 79524f3818001d..1bda73f561f978 100644 --- 
a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -18,6 +18,7 @@ #include "serialization.hpp" #include "spatial.hpp" #include "weights_bank.hpp" +#include "logging.hpp" namespace intel_npu { class Plugin; @@ -96,7 +97,8 @@ class CompiledModel : public ov::npuw::ICompiledModel { bool unpack_required(const std::size_t idx) const; bool unpack_required(const std::size_t idx, const std::size_t cidx) const; - void log_device_dist() const; + void log_device_dist(ov::npuw::LogLevel log_lvl = ov::npuw::LogLevel::Info) const; + void implement_properties(); // For full deserialization flow with weights @@ -181,7 +183,9 @@ class CompiledModel : public ov::npuw::ICompiledModel { }; std::vector m_compiled_submodels; - std::function&, const ov::SoPtr&)> m_acc_check; + std::function&, const ov::SoPtr&, double*)> m_acc_check; + std::string m_acc_check_name; + double m_acc_check_threshold; std::string m_ref_device; execution_stats m_total_stat; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index e1c4a957ab2039..1b68732a10d4bd 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -129,7 +129,7 @@ void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) { // - Look for an output tensor to reuse // - If there's one, assign it to this allocation // - If there's none, allocate a new tensor - // - How a tensor to reuse is piced: + // - How a tensor to reuse is picked: // 1. It should exist // 2. It's "remaining reads" count should be 0 (all planned reads // happened at this point). @@ -265,6 +265,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrlog_device_dist(); + m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error); } // Identify connections for the funcall pipeline, if needed @@ -578,11 +579,11 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { } void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - auto real_idx = comp_model_desc.replaced_by.value_or(idx); + std::size_t real_idx = real(idx); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const auto is_piped = is_pipelined(idx); - auto new_rqs = create_infer_requests(idx, is_piped ? 2 : 1); + const auto is_piped = is_pipelined(real_idx); + auto new_rqs = create_infer_requests(real_idx, is_piped ? 2 : 1); // NB: Regardless if this subrequest was a function call // or not, always use the real_idx here - for regular @@ -599,13 +600,13 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { // overkill - only affected subrequest(s) could be updated instead, // but it is a more complex thing and can be implemented separately connect_subrequests(); - m_subrequest_devices[idx] = *comp_model_desc.device_it; + m_subrequest_devices[real_idx] = *comp_model_desc.device_it; } void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, bool& failover) { failover = false; - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - auto real_idx = comp_model_desc.replaced_by.value_or(idx); + bool accuracy_failover = false; + auto real_idx = real(idx); // Infer is also fail-safe... 
bool job_done = false; @@ -628,7 +629,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo // the subrequest' outputs to global Results, if relevant. bind_global_results(idx); - if (comp_model_desc.replaced_by) { + if (m_npuw_model->m_compiled_submodels[idx].replaced_by) { function_prologue(idx); } if (!dump_in) { @@ -639,7 +640,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo try { LOG_DEBUG("Trying to run subrequest[" << idx << "]..."); LOG_BLOCK(); - unsafe_run_this_prep_next(idx, next_prepared); + unsafe_run_this_prep_next(idx, next_prepared, accuracy_failover); job_done = true; LOG_DEBUG("Done: " << idx << "(exec subrequest)"); } catch (const std::exception& ex) { @@ -654,7 +655,8 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo LOG_INFO("- Trying next device..."); // Altering iterators here!! Contracts should be changed! - comp_model_desc.device_it++; + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + proto_comp_model_desc.device_it++; if (!m_npuw_model->compile_for_success(real_idx)) { OPENVINO_THROW("Failed to compile. No more devices are left!"); } @@ -670,36 +672,41 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo std::swap(m_subrequests[real_idx], m_funcall_pipeline[real_idx].subrequest); } } + + failover |= accuracy_failover; } -void ov::npuw::JustInferRequest::unsafe_during(std::size_t real_idx, const std::function& f) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - if (!comp_model_desc.spatial) { +void ov::npuw::JustInferRequest::unsafe_during(std::size_t idx, + const std::function& f, + bool& accuracy_failover) { + std::size_t real_idx = real(idx); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!proto_comp_model_desc.spatial) { // Non-spatial execution: trigger request asynchronously, run `f` in this context - auto& r = m_subrequests[real_idx]; - r->start_async(); + try_accurate_substart_async(idx); f(); // expect noexcept - r->wait(); + try_accurate_subwait(idx, accuracy_failover); } else { // Spatial execution... Do the opposite - run f asynchronously, and meanwhile run the // spatial inference auto future = std::async(std::launch::async, f); - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); future.wait(); } } -void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - auto& r = m_subrequests[real_idx]; - if (!comp_model_desc.spatial) { +void ov::npuw::JustInferRequest::unsafe_infer(std::size_t idx, bool& accuracy_failover) { + std::size_t real_idx = real(idx); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!proto_comp_model_desc.spatial) { // Run normally - r->infer(); + try_accurate_subinfer(idx, accuracy_failover); } else { + auto& r = m_subrequests[real_idx]; // Run over the specified range... Note: the full inputs/outputs // must be prepared in the m_spatial_io at this point - const auto& spatial = comp_model_desc.spatial.value(); - const auto num_outputs = comp_model_desc.compiled_model->outputs().size(); + const auto& spatial = proto_comp_model_desc.spatial.value(); + const auto num_outputs = proto_comp_model_desc.compiled_model->outputs().size(); NPUW_ASSERT(m_spatial_selector); // Create a sparse vector with full input sizes. 
@@ -707,7 +714,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // number of input parameters (activations) so some slots may be // not used here. // FIXME: All these preparations could be done statically (just once) - std::vector full_in_shapes(comp_model_desc.param_base); + std::vector full_in_shapes(proto_comp_model_desc.param_base); for (auto&& param : spatial.params) { full_in_shapes[param.idx] = m_spatial_io[real_idx].inputs.at(param.idx)->get_shape(); } @@ -732,7 +739,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Collect spatial inputs for this offset for (auto&& param : spatial.params) { - const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[param.idx]; const auto& iview = ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), param.dim, offset, spatial.nway); r->set_tensor(iport, iview); @@ -740,7 +747,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now set the spatial outputs for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; r->set_tensor(oport, ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), spatial.out_dim, @@ -749,7 +756,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } // for(outputs) // Now run the part - r->infer(); + try_accurate_subinfer(idx, offset, spatial.nway, accuracy_failover); } // for(full_nway_times) // Now process the tail, if required @@ -762,7 +769,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { offset, spatial.tail_size); - const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[param.idx]; auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].input_tails.at(param.idx), param.dim, 0, @@ -774,12 +781,12 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now set the tail tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; r->set_tensor(oport, m_spatial_io[real_idx].output_tails.at(out_idx)); } // for(outputs) // Now run the tail infer - r->infer(); + try_accurate_subinfer(idx, offset, spatial.tail_size, accuracy_failover); // Now copy the views from the output full-nway tensor to the output tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { @@ -798,7 +805,8 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } } -void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared) { +void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared, + bool& accuracy_failover) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); const std::size_t next_idx = next(idx + 1); @@ -812,18 +820,18 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (is_pipelined(real_idx)) { // function pipelining is here! and the next rq is ours. 
NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx); - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << next_idx << "]..."); LOG_BLOCK(); // Note: do it here unconditionally - if this request fails, // have to resubmit all the data to the recompiled pair anyway bind_global_parameters(next_idx); unpack_closure(next_idx, m_funcall_pipeline[real_idx].subrequest); - }); + }, accuracy_failover); } else { // Function pipelining is not used. THIS infer request // is also the NEXT one. Nothing much to do here - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); bind_global_parameters(next_idx); } } else { @@ -833,9 +841,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (next_idx == 0) { // Note: even if m_function_pipelining is ON, // SWAP won't happen here - see the below check for .next - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); } else { - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; @@ -846,21 +854,21 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool LOG_BLOCK(); unpack_closure(my_next_idx, m_funcall_pipeline[real_idx].subrequest); } - }); + }, accuracy_failover); } } } else { // This is a regular subgraph. Start it async to prepare the next // parameters if (next_idx == 0) { - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); } else { - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; } - }); + }, accuracy_failover); } } // if (replaced_by) } @@ -881,10 +889,6 @@ bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } -void ov::npuw::JustInferRequest::update_subrequest_links(std::size_t) { - connect_subrequests(); -} - bool ov::npuw::JustInferRequest::is_pipelined(std::size_t idx) const { const auto& desc = m_npuw_model->m_compiled_submodels[real(idx)]; return m_use_function_pipelining && desc.replaced_by && !desc.forced_to_fcall; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index 5e8273833b2f16..3a2021041d0ca0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -81,7 +81,6 @@ class JustInferRequest final : public IBaseInferRequest { void complete_subrequest(std::size_t idx) override; void cancel_subrequest(std::size_t idx) override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; TensorPtr alloc_global_out(std::size_t out_idx) override; @@ -97,9 +96,9 @@ class JustInferRequest final : public IBaseInferRequest { void function_prologue(std::size_t idx); - void unsafe_during(std::size_t real_idx, const std::function& f); - void unsafe_infer(std::size_t real_idx); - void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared_p); + void unsafe_during(std::size_t idx, const std::function& f, bool& accuracy_failover); + void unsafe_infer(std::size_t idx, bool& accuracy_failover); + void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared, bool& accuracy_failover); void connect_subrequests(); void recreate_subrequests(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
index 0e60f2886f142b..edc5b3a87240e9 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
@@ -37,6 +37,25 @@ ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
         m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request();
         m_subrequest_devices[i] = *proto_comp_model_desc.device_it;
+
+        if (m_npuw_model->m_acc_check) {
+            LOG_INFO("Create reference subrequest for Subgraph[" << i << "] on " << m_npuw_model->m_ref_device << "...");
+            LOG_BLOCK();
+            if (m_npuw_model->submodel_device(i) != m_npuw_model->m_ref_device) {
+                auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(real(i)).ref_compiled_model;
+                ov::SoPtr<ov::IAsyncInferRequest> ref_infer_request = {ref_submodel->create_infer_request(),
+                                                                       ref_submodel._so};
+                NPUW_ASSERT(ref_infer_request);
+                m_ref_subrequests.at(i) = std::move(ref_infer_request);
+                LOG_INFO("Done");
+            } else {
+                LOG_INFO("Skip creation of reference subrequest for Subgraph["
+                         << i << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
+                         << i << "] has already been created on "
+                         << "it.");
+            }
+        }
+
         LOG_INFO("DONE");
     }  // for(submodels)
 
@@ -86,6 +105,7 @@ bool ov::npuw::UnfoldInferRequest::valid_subrequest(std::size_t idx) const {
 
 void ov::npuw::UnfoldInferRequest::infer() {
     const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
+    bool accuracy_failover = false;
 
     auto prepare = [&](std::size_t idx) {
         if (idx >= m_subrequests.size()) {
             return;
         }
         bind_global_params(idx, m_subrequests[idx]);
         bind_global_results(idx, m_subrequests[idx]);
     };
-    auto wait_and_clear = [](RqPtrs& rqs) {
-        for (auto&& r : rqs) {
-            r->wait();
+    auto wait_and_clear = [&](std::vector<std::size_t>& rqs_ids) {
+        for (auto&& r_id : rqs_ids) {
+            try_accurate_subwait(r_id, accuracy_failover);
         }
-        rqs.clear();
+        rqs_ids.clear();
     };
 
     if (do_async) {
         std::size_t past_repl_id = 0u;
-        RqPtrs previous_requests;
+        std::vector<std::size_t> previous_requests;
 
         prepare(0);
         for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
@@ -121,8 +141,8 @@ void ov::npuw::UnfoldInferRequest::infer() {
                 wait_and_clear(previous_requests);
                 past_repl_id = this_repl_id;
             }
-            subr->start_async();
-            previous_requests.push_back(subr);
+            try_accurate_substart_async(idx);
+            previous_requests.push_back(idx);
             prepare(idx + 1);
         }
         wait_and_clear(previous_requests);
@@ -134,9 +154,189 @@ void ov::npuw::UnfoldInferRequest::infer() {
                 prepare(idx + 1);
                 continue;
             }
-            subr->start_async();
+            try_accurate_substart_async(idx);
             prepare(idx + 1);
-            subr->wait();
+            try_accurate_subwait(idx, accuracy_failover);
         }
     }  // (async)
+
+    if (accuracy_failover) {
+        LOG_ERROR("Refined device distribution:");
+        LOG_BLOCK();
+        m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error);
+    }
+}
+
+namespace {
+    void set_inputs(const ov::SoPtr<ov::IAsyncInferRequest>& from, ov::SoPtr<ov::IAsyncInferRequest>& to) {
+        const auto& from_comp_model = from->get_compiled_model();
+        const auto& to_comp_model = to->get_compiled_model();
+        for (size_t i = 0; i < from_comp_model->inputs().size(); i++) {
+            const auto& itnsr = from->get_tensor(from_comp_model->inputs()[i]);
+            to->set_tensor(to_comp_model->inputs()[i], itnsr);
+        }
+    }
+
+    void copy_results(const ov::SoPtr<ov::IAsyncInferRequest>& from, ov::SoPtr<ov::IAsyncInferRequest>& to) {
+        const auto& from_comp_model = from->get_compiled_model();
+        const auto& to_comp_model = to->get_compiled_model();
+        for (size_t i = 0; i < to_comp_model->outputs().size(); i++) {
+            const auto& from_tnsr = 
from->get_tensor(from_comp_model->outputs()[i]); + const auto& to_tnsr = to->get_tensor(to_comp_model->outputs()[i]); + from_tnsr->copy_to(to_tnsr._ptr); + } + } + + std::stringstream create_launch_msg(std::size_t idx, std::size_t real_idx) { + std::stringstream log_msg_stream; + log_msg_stream << "Launching subrequest[" << idx << "]" << + ((real_idx == idx) ? std::string("...").c_str() : + std::string(std::string(", which is actually subrequest[") + + std::to_string(real_idx) + "]").c_str()); + return log_msg_stream; + } +} // anonymous namespace + +void ov::npuw::UnfoldInferRequest::try_accurate_substart_async(std::size_t subidx) { + auto& act_subr = m_subrequests.at(subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->start_async(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, subidx); + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real(subidx)].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(subidx); + auto& ref_subr = m_ref_subrequests.at(subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->start_async(); + } else { + act_subr->start_async(); + } +} + +void ov::npuw::UnfoldInferRequest::try_accurate_subwait(std::size_t subidx, bool& accuracy_failover) { + auto& act_subr = m_subrequests.at(subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->wait(); + return; + } + + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real(subidx)].switched_to_ref) { + auto& act_subr = m_subrequests.at(subidx); + auto& ref_subr = m_ref_subrequests.at(subidx); + + ref_subr->wait(); + copy_results(ref_subr, act_subr); + } else { + act_subr->wait(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::UnfoldInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) { + if (!m_npuw_model->m_acc_check) { + return; + } + + LOG_INFO("Check if subrequest[" << idx << "] is accurate..."); + LOG_BLOCK(); + + std::size_t real_idx = real(idx); + OPENVINO_ASSERT(m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref == false); + + if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) { + LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device."); + return; + } + + accuracy_failover = false; + auto& actual_subr = m_subrequests.at(idx); + auto& ref_subr = m_ref_subrequests.at(idx); + + // Setting inputs: + set_inputs(actual_subr, ref_subr); + + // Running inference: + ref_subr->infer(); + + // Comparing results of actual and reference inferfences: + LOG_INFO("Compare actual outputs against references:"); + bool tensors_converge = true; + const auto& actual_comp_model = actual_subr->get_compiled_model(); + const auto& ref_comp_model = ref_subr->get_compiled_model(); + std::vector converges(actual_comp_model->outputs().size()); + std::vector metrics(actual_comp_model->outputs().size()); + for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]); + const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]); + converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]); + tensors_converge &= converges[i]; + } + if (tensors_converge == false) { + if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) { + // For just log level error print header message: + 
LOG_ERROR("Check if subrequest[" << idx << "] is accurate..."); + } + } + // Log comparison details: + for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + if (converges[i]) { + LOG_INFO(" - " << actual_comp_model->outputs()[i]); + LOG_BLOCK(); + LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] << + ", threshold: " << m_npuw_model->m_acc_check_threshold << "."); + LOG_INFO("PASS"); + } else { + LOG_ERROR(" - " << actual_comp_model->outputs()[i]); + LOG_BLOCK(); + LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] << + ", threshold: " << m_npuw_model->m_acc_check_threshold << "."); + LOG_ERROR("FAIL"); + } + } + + // If comparison fails, copy reference results to original tensors and mark subgraph as + // switched to reference: + if (tensors_converge) { + LOG_INFO("PASS"); + } else { + LOG_ERROR("FAIL"); + LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results."); + if (idx != real_idx) { + LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx << + "], all subrequests, corresponding to last, will be further " << + "launched on " << m_npuw_model->m_ref_device << ".'"); + } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) { + LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " << + "subrequests, corresponding to it, will be further launched on " << + m_npuw_model->m_ref_device << "."); + } + + if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) { + // Not here anymore due to optimizations. + const auto model = m_npuw_model->m_compiled_submodels[real_idx].model; + const auto model_path = std::string("inaccurate_") + model->get_friendly_name() + std::string(".xml"); + ov::save_model(model, model_path); + dump_input_tensors(idx, true); + dump_output_tensors(idx, true); + } + + // Due to complex memory management logic it is safe to just copy + // results back to already properly allocated and linked tensors: + copy_results(ref_subr, actual_subr); + m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref = true; + accuracy_failover = true; + } + + LOG_INFO("Done"); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp index 76b67571ec4c40..170304eb52633b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp @@ -32,10 +32,13 @@ class UnfoldInferRequest final : public IBaseInferRequest { bool supports_async_pipeline() const override { return false; } - void update_subrequest_links(std::size_t) override {} private: void infer() override; + void try_accurate_substart_async(std::size_t idx) override; + void try_accurate_subwait(std::size_t idx, bool& accuracy_failover) override; + void ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) override; + }; } // namespace npuw