@@ -75,7 +75,7 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
             OPENVINO_THROW("NPUW: TEMPORARY LIMITATION: Couldn't create reference infer "
                            "requests if 'nireq' is set to > 1!");
         }
-        LOG_INFO("Create reference subrequest for submodel [" << id << "] on " << m_npuw_model->m_ref_device << "...");
+        LOG_INFO("Create reference subrequest for Subgraph [" << id << "] on " << m_npuw_model->m_ref_device << "...");
         LOG_BLOCK();
         if (m_npuw_model->submodel_device(id) != m_npuw_model->m_ref_device) {
            auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(id).ref_compiled_model;
@@ -85,60 +85,129 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re
             m_ref_subrequests.at(id) = std::move(ref_infer_request);
             LOG_INFO("Done");
         } else {
-            LOG_INFO("Skip creation of reference subrequest for submodule ["
-                     << id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
-                     << id << "] has been already created on "
-                     << "it.");
+            LOG_INFO("Skip creation of reference subrequest for Subgraph ["
+                     << id << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
+                     << id << "] has been already created on "
+                     << "it.");
         }
     }
 
     return rqs;
 }
 
-void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& failover) {
+// The index <-> real_idx mapping matters here
+void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) {
+    if (!m_npuw_model->m_acc_check) {
+        return;
+    }
     LOG_INFO("Check if subrequest[" << idx << "] is accurate...");
     LOG_BLOCK();
-    failover = false;
-    if (m_ref_subrequests.at(idx) != nullptr && m_subrequests.at(idx)._ptr != m_ref_subrequests.at(idx)._ptr) {
-        NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref == false);
-        NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).replaced_by.value_or(idx) == idx);
 
-        const auto& ref_comp_model = m_ref_subrequests.at(idx)->get_compiled_model();
-        const auto& actual_comp_model = m_subrequests.at(idx)->get_compiled_model();
+    if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) {
+        LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device.");
+        return;
+    }
+
+    accuracy_failover = false;
+
+    std::size_t real_idx = real(idx);
+    auto& actual_subr = m_subrequests.at(real_idx);
+    auto& ref_subr = m_ref_subrequests.at(real_idx);
+    NPUW_ASSERT(ref_subr);
+
+    // This check is omitted in the new PR. What does it mean?
+    // Check for already switched request is omitted -> that is okay.
+    if (actual_subr._ptr != ref_subr._ptr) {
+        NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(real_idx).switched_to_ref == false);
+
+        const auto& ref_comp_model = ref_subr->get_compiled_model();
+        const auto& actual_comp_model = actual_subr->get_compiled_model();
         NPUW_ASSERT(actual_comp_model->inputs().size() == ref_comp_model->inputs().size());
         // Setting inputs:
         for (size_t i = 0; i < actual_comp_model->inputs().size(); i++) {
-            const auto& itensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->inputs()[i]);
-            m_ref_subrequests.at(idx)->set_tensor(ref_comp_model->inputs()[i], itensor);
+            const auto& itensor = actual_subr->get_tensor(actual_comp_model->inputs()[i]);
+            ref_subr->set_tensor(ref_comp_model->inputs()[i], itensor);
         }
-        m_ref_subrequests.at(idx)->infer();
+        // Running inference:
+        ref_subr->infer();
 
+        // Comparing results of actual and reference inferences:
         LOG_INFO("Compare actual outputs against references:");
         bool tensors_converge = true;
+        std::vector<bool> converges(actual_comp_model->outputs().size());
+        std::vector<double> metrics(actual_comp_model->outputs().size());
         for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
-            LOG_INFO("- " << actual_comp_model->outputs()[i]);
-            const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-            const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
-            LOG_BLOCK();
-            tensors_converge &= m_npuw_model->m_acc_check(actual_tensor, ref_tensor);
+            const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
+            const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
+            converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]);
+            tensors_converge &= converges[i];
+        }
+        if (tensors_converge == false) {
+            if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) {
+                // When the log level is just ERROR, print the header message here:
+                LOG_ERROR("Check if subrequest[" << idx << "] is accurate...");
+            }
+        }
+        // Log comparison details:
+        for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+            if (converges[i]) {
+                LOG_INFO("- " << actual_comp_model->outputs()[i]);
+                LOG_BLOCK();
+                LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                         ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+                LOG_INFO("PASS");
+            } else {
+                LOG_ERROR("- " << actual_comp_model->outputs()[i]);
+                LOG_BLOCK();
+                LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                          ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+                LOG_ERROR("FAIL");
+            }
         }
-        LOG_INFO((tensors_converge ? "PASS" : "FAIL"));
 
-        if (!tensors_converge) {
-            LOG_INFO("Subrequest is inaccurate, failover to reference.");
-            // FIXME: We need to copy reference tensors to actual only in single-model-inference mode
-            // or if our subgraph is last in the chain.
+        // If comparison fails, failover to reference results:
+        if (tensors_converge) {
+            LOG_INFO("PASS");
+        } else {
+            LOG_ERROR("FAIL");
+            LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results.");
+            if (idx != real_idx) {
+                LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx <<
+                          "], all subrequests, corresponding to the latter, will be further " <<
+                          "launched on " << m_npuw_model->m_ref_device << ".");
+            } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) {
+                LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " <<
+                          "subrequests, corresponding to it, will be further launched on " <<
+                          m_npuw_model->m_ref_device << ".");
+            }
+
+            if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) {
+                const auto model = m_npuw_model->m_compiled_submodels[real_idx].model;
+                const auto model_path = "inaccurate_" + model->get_friendly_name() + ".xml";
+                ov::save_model(model, model_path);
+                dump_input_tensors(idx, true);
+                dump_output_tensors(idx, true);
+            }
+
+            // TODO: For future implementation of accuracy failover in spatial mode, it will
+            // be safe to just copy results from reference subrequest back to already
+            // properly allocated and linked tensors of actual subrequest due to the
+            // complex memory management logic.
+
+            // FIXME: Now, we need to copy reference tensors to actual only in
+            // single-model-inference mode or if our subgraph is the last in the chain.
             for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
-                const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-                const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
+                const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
+                const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
                 ref_tensor->copy_to(actual_tensor._ptr);
             }
-            m_npuw_model->m_compiled_submodels.at(idx).compiled_model =
-                m_npuw_model->m_compiled_submodels.at(idx).ref_compiled_model;
-            m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref = true;
-            m_subrequests.at(idx) = m_ref_subrequests.at(idx);
+            m_npuw_model->m_compiled_submodels.at(real_idx).compiled_model =
+                m_npuw_model->m_compiled_submodels.at(real_idx).ref_compiled_model;
+            m_subrequests.at(real_idx) = m_ref_subrequests.at(real_idx);
+            // Using idx here, let real_idx be handled inside if needed.
             update_subrequest_links(idx);
-            failover = true;
+            m_npuw_model->m_compiled_submodels.at(real_idx).switched_to_ref = true;
+            accuracy_failover = true;
         }
 
         LOG_INFO("Done");
@@ -226,9 +295,9 @@ void ov::npuw::IBaseInferRequest::infer() {
     m_run_iter++;
 
     if (failover_happened) {
-        LOG_INFO("Refined device distribution:");
+        LOG_ERROR("Refined device distribution:");
         LOG_BLOCK();
-        m_npuw_model->log_device_dist();
+        m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error);
     }
     m_now_idx.reset();
 }
@@ -488,12 +557,11 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req
     LOG_DEBUG("Done");
 }
 
-void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
+void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx, bool forced) {
     const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
     const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
-    auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
-
-    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) {
+    const std::size_t real_idx = real(idx);
+    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) {
         return;
     }
 
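For reference, the `real(idx)` helper that replaces the inline lookup here is not shown in this hunk. Under the assumption that it simply resolves a subgraph index to the submodel that owns the compiled body, i.e. the same `replaced_by.value_or(idx)` expression the deleted line used, a self-contained sketch could look as follows; `resolve_real_index` and the vector-of-optionals layout are illustrative only.

    #include <cstddef>
    #include <optional>
    #include <vector>

    // Assumed behaviour of real(idx): a subgraph folded into a shared function body
    // records the owning submodel index in replaced_by, and value_or(idx) keeps the
    // index itself when no replacement was recorded -- the same expression the
    // deleted line above used inline.
    std::size_t resolve_real_index(std::size_t idx,
                                   const std::vector<std::optional<std::size_t>>& replaced_by) {
        return replaced_by.at(idx).value_or(idx);
    }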
@@ -569,12 +637,12 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) {
     }
 }
 
-void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) {
+void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx, bool forced) {
     const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>();
     const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size();
     auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx);
 
-    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) {
+    if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) {
         return;
     }
 
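The new `forced` flag lets the accuracy-failover path dump the failing subgraph's I/O even when no dump option is set by the user. As a rough usage sketch of how this feature would typically be triggered from application code: the property keys below are assumptions inferred from the config identifiers referenced in the diff (e.g. `NPUW_ACC_DUMP_FAILS`) and the NPUW accuracy-check options, and "model.xml" is a placeholder; verify the exact keys against your OpenVINO build before relying on them.

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        auto model = core.read_model("model.xml");  // placeholder path

        // Hypothetical NPUW configuration: enable per-subgraph accuracy checks
        // against a reference device and dump artifacts of inaccurate subgraphs.
        ov::AnyMap npuw_cfg = {
            {"NPU_USE_NPUW", "YES"},         // route compilation through NPUW
            {"NPUW_ACC_CHECK", "YES"},       // enable per-subgraph accuracy checks
            {"NPUW_ACC_DEVICE", "CPU"},      // reference device for the checks
            {"NPUW_ACC_DUMP_FAILS", "YES"},  // dump model + I/O of inaccurate subgraphs
        };
        auto compiled = core.compile_model(model, "NPU", npuw_cfg);
        auto request = compiled.create_infer_request();
        request.infer();  // accuracy failover, if any, is reported at ERROR log level
        return 0;
    }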