diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 82dfaceb12ddd4..05a70a9c0c8796 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -98,6 +98,7 @@ DEFINE_OPT(NPUW_UNFOLD_IREQS, bool, false, npuw::unfold_ireqs, RunTime); DEFINE_OPT(NPUW_ACC_CHECK, bool, false, npuw::accuracy::check, RunTime); DEFINE_OPT(NPUW_ACC_THRESH, double, 0.01, npuw::accuracy::threshold, RunTime); DEFINE_OPT(NPUW_ACC_DEVICE, std::string, "", npuw::accuracy::reference_device, RunTime); +DEFINE_OPT(NPUW_ACC_DUMP_FAILS, bool, false, npuw::accuracy::dump_failures, RunTime); DEFINE_OPT(NPUW_DUMP_FULL, bool, false, npuw::dump::full, RunTime); DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, RunTime); DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, RunTime); diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index afdd28a46c5ddc..12354701eed725 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -331,6 +331,14 @@ static constexpr ov::Property threshold{"NPUW_ACC_THRESH"}; * Default value: empty. */ static constexpr ov::Property reference_device{"NPUW_ACC_DEVICE"}; + +/** + * @brief + * Type: bool. + * Enable dumps of materials for model(s), failing accuracy check. + * Default value: false. + */ +static constexpr ov::Property dump_failures{"NPUW_ACC_DUMP_FAILS"}; } // namespace accuracy namespace dump { diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 76a28acc8016d4..edb666bda427a7 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -47,6 +47,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); desc.add(); desc.add(); + desc.add(); #ifdef NPU_PLUGIN_DEVELOPER_BUILD desc.add(); desc.add(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp index 4440027c818969..13294ac521f122 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.cpp @@ -13,30 +13,47 @@ ov::npuw::metrics::NRMSE::NRMSE(double threshold) : m_threshold(threshold) {} bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr& actual, - const ov::SoPtr& reference) const { - NPUW_ASSERT(actual->is_continuous()); - NPUW_ASSERT(reference->is_continuous()); + const ov::SoPtr& reference, + double* result) const { NPUW_ASSERT(actual->get_shape() == reference->get_shape()); // Check for alignment: NPUW_ASSERT(actual->get_byte_size() == reference->get_byte_size()); - // FIXME: Check for strides + + ov::Tensor in_actual(actual->get_element_type(), actual->get_shape()); + ov::Tensor in_reference(reference->get_element_type(), reference->get_shape()); + + if (!actual->is_continuous()) { + ov::make_tensor(actual).copy_to(in_actual); + } else { + in_actual = ov::make_tensor(actual); + } + if (!reference->is_continuous()) { + ov::make_tensor(reference).copy_to(in_reference); + } else { + in_reference = ov::make_tensor(reference); + } + + // TODO: it might be more correct to make 
the to_f32 function
+    // work with strided tensors
+    NPUW_ASSERT(in_actual.is_continuous());
+    NPUW_ASSERT(in_reference.is_continuous());
 
     ov::Tensor actual_f32;
     ov::Tensor reference_f32;
 
-    if (ov::element::Type_t::f32 == actual->get_element_type()) {
-        actual_f32 = ov::make_tensor(actual);
+    if (ov::element::f32 == in_actual.get_element_type()) {
+        actual_f32 = in_actual;
     } else {
-        ov::Tensor dst(ov::element::Type_t::f32, actual->get_shape());
-        ov::npuw::util::to_f32(ov::make_tensor(actual), dst);
+        ov::Tensor dst(ov::element::Type_t::f32, in_actual.get_shape());
+        ov::npuw::util::to_f32(in_actual, dst);
         actual_f32 = std::move(dst);
     }
-    if (ov::element::Type_t::f32 == reference->get_element_type()) {
-        reference_f32 = ov::make_tensor(reference);
+    if (ov::element::f32 == in_reference.get_element_type()) {
+        reference_f32 = in_reference;
     } else {
-        ov::Tensor dst(ov::element::Type_t::f32, reference->get_shape());
-        ov::npuw::util::to_f32(ov::make_tensor(reference), dst);
+        ov::Tensor dst(ov::element::Type_t::f32, in_reference.get_shape());
+        ov::npuw::util::to_f32(in_reference, dst);
         reference_f32 = dst;
     }
 
@@ -51,13 +68,21 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
     }
 
     if (squared_error <= std::numeric_limits<double>::epsilon()) {
-        LOG_INFO("NRMSE loss: 0.0, threshold: " << m_threshold << ".");
-        LOG_INFO("PASS");
+        if (result != nullptr) {
+            *result = 0.0;
+        }
         return true;
     }
 
     double rmse = sqrt(squared_error / size);
-    NPUW_ASSERT(rmse >= 0.0);
+
+    if (rmse < 0.0) {
+        // The calculated RMSE is < 0.0, which is unexpected, so report the tensors as not equal.
+        if (result != nullptr) {
+            *result = rmse;
+        }
+        return false;
+    }
 
     auto actual_min_max = std::minmax_element(actual_data, actual_data + size);
     auto reference_min_max = std::minmax_element(reference_data, reference_data + size);
@@ -66,9 +91,8 @@ bool ov::npuw::metrics::NRMSE::operator()(const ov::SoPtr<ov::ITensor>& actual,
                                      std::max(0.f, *actual_min_max.second) - std::min(0.f, *actual_min_max.first)});
 
     double nrmse = rmse / den;
-    LOG_INFO("NRMSE loss: " << nrmse << ", threshold: " << m_threshold << ".");
-
-    bool success = nrmse <= m_threshold;
-    LOG_INFO((success ? 
"PASS" : "FAIL")); - return success; + if (result != nullptr) { + *result = nrmse; + } + return nrmse <= m_threshold; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp index e77a38ced0edc2..1d0182582946c3 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/accuracy/comparator.hpp @@ -15,8 +15,9 @@ namespace metrics { class NRMSE { public: explicit NRMSE(double threshold); - bool operator()(const ov::SoPtr& backup_tensor, const ov::SoPtr& original_tensor) const; - + bool operator()(const ov::SoPtr& backup_tensor, + const ov::SoPtr& original_tensor, + double* result = nullptr) const; private: double m_threshold{}; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp index c6fc0d6dd563c4..a6fb0cae4d8436 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.cpp @@ -75,7 +75,7 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re OPENVINO_THROW("NPUW: TEMPORARY LIMITATION: Couldn't create reference infer " "requests if 'nireq' is set to > 1!"); } - LOG_INFO("Create reference subrequest for submodel [" << id << "] on " << m_npuw_model->m_ref_device << "..."); + LOG_INFO("Create reference subrequest for Subgraph[" << id << "] on " << m_npuw_model->m_ref_device << "..."); LOG_BLOCK(); if (m_npuw_model->submodel_device(id) != m_npuw_model->m_ref_device) { auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(id).ref_compiled_model; @@ -94,56 +94,215 @@ ov::npuw::IBaseInferRequest::RqPtrs ov::npuw::IBaseInferRequest::create_infer_re return rqs; } -void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& failover) { +namespace { + void set_inputs(const ov::SoPtr& from, ov::SoPtr& to) { + const auto& from_comp_model = from->get_compiled_model(); + const auto& to_comp_model = to->get_compiled_model(); + for (size_t i = 0; i < from_comp_model->inputs().size(); i++) { + const auto& itnsr = from->get_tensor(from_comp_model->inputs()[i]); + to->set_tensor(to_comp_model->inputs()[i], itnsr); + } + } + + void copy_results(const ov::SoPtr& from, ov::SoPtr& to) { + const auto& from_comp_model = from->get_compiled_model(); + const auto& to_comp_model = to->get_compiled_model(); + for (size_t i = 0; i < to_comp_model->outputs().size(); i++) { + const auto& from_tnsr = from->get_tensor(from_comp_model->outputs()[i]); + const auto& to_tnsr = to->get_tensor(to_comp_model->outputs()[i]); + from_tnsr->copy_to(to_tnsr._ptr); + } + } + + std::stringstream create_launch_msg(std::size_t idx, std::size_t real_idx) { + std::stringstream log_msg_stream; + log_msg_stream << "Launching subrequest[" << idx << "]" << + ((real_idx == idx) ? 
std::string("...").c_str() : + std::string(std::string(", which is actually subrequest[") + + std::to_string(real_idx) + "]").c_str()); + return log_msg_stream; + } +} // anonymous namespace + +void ov::npuw::IBaseInferRequest::try_accurate_subinfer(std::size_t subidx, std::size_t offset, + std::size_t len, bool& accuracy_failover) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->infer(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, real_subidx); + if (m_npuw_model->m_compiled_submodels[real_subidx].spatial && len != 0) { + log_msg_stream << ", on range : [" << offset << ", " << offset + len << ")"; + } + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->infer(); + copy_results(ref_subr, act_subr); + } else { + act_subr->infer(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::IBaseInferRequest::try_accurate_subinfer(std::size_t subidx, bool& accuracy_failover) { + try_accurate_subinfer(subidx, 0, 0, accuracy_failover); +} + +void ov::npuw::IBaseInferRequest::try_accurate_substart_async(std::size_t subidx) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->start_async(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, real_subidx); + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->start_async(); + } else { + act_subr->start_async(); + } +} + +void ov::npuw::IBaseInferRequest::try_accurate_subwait(std::size_t subidx, bool& accuracy_failover) { + auto real_subidx = real(subidx); + auto& act_subr = m_subrequests.at(real_subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->wait(); + return; + } + + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real_subidx].switched_to_ref) { + auto& act_subr = m_subrequests.at(real_subidx); + auto& ref_subr = m_ref_subrequests.at(real_subidx); + + ref_subr->wait(); + copy_results(ref_subr, act_subr); + } else { + act_subr->wait(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::IBaseInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) { + if (!m_npuw_model->m_acc_check) { + return; + } + LOG_INFO("Check if subrequest[" << idx << "] is accurate..."); LOG_BLOCK(); - failover = false; - if (m_ref_subrequests.at(idx) != nullptr && m_subrequests.at(idx)._ptr != m_ref_subrequests.at(idx)._ptr) { - NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref == false); - NPUW_ASSERT(m_npuw_model->m_compiled_submodels.at(idx).replaced_by.value_or(idx) == idx); - - const auto& ref_comp_model = m_ref_subrequests.at(idx)->get_compiled_model(); - const auto& actual_comp_model = m_subrequests.at(idx)->get_compiled_model(); - 
NPUW_ASSERT(actual_comp_model->inputs().size() == ref_comp_model->inputs().size());
-        // Setting inputs:
-        for (size_t i = 0; i < actual_comp_model->inputs().size(); i++) {
-            const auto& itensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->inputs()[i]);
-            m_ref_subrequests.at(idx)->set_tensor(ref_comp_model->inputs()[i], itensor);
-        }
-        m_ref_subrequests.at(idx)->infer();
-        LOG_INFO("Compare actual outputs against references:");
-        bool tensors_converge = true;
-        for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+    std::size_t real_idx = real(idx);
+    OPENVINO_ASSERT(m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref == false);
+
+    if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) {
+        LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device.");
+        return;
+    }
+
+    accuracy_failover = false;
+    auto& actual_subr = m_subrequests.at(real_idx);
+    auto& ref_subr = m_ref_subrequests.at(real_idx);
+
+    // Setting inputs:
+    set_inputs(actual_subr, ref_subr);
+
+    // Running inference:
+    ref_subr->infer();
+
+    // Comparing results of actual and reference inferences:
+    LOG_INFO("Compare actual outputs against references:");
+    bool tensors_converge = true;
+    const auto& actual_comp_model = actual_subr->get_compiled_model();
+    const auto& ref_comp_model = ref_subr->get_compiled_model();
+    std::vector<bool> converges(actual_comp_model->outputs().size());
+    std::vector<double> metrics(actual_comp_model->outputs().size());
+    for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+        const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]);
+        const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]);
+        converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]);
+        tensors_converge &= converges[i];
+    }
+    if (tensors_converge == false) {
+        if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) {
+            // When the log level is limited to Error, print the header message here:
+            LOG_ERROR("Check if subrequest[" << idx << "] is accurate...");
+        }
+    }
+    // Log comparison details:
+    for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
+        if (converges[i]) {
             LOG_INFO(" - " << actual_comp_model->outputs()[i]);
-            const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-            const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
             LOG_BLOCK();
-            tensors_converge &= m_npuw_model->m_acc_check(actual_tensor, ref_tensor);
-        }
-        LOG_INFO((tensors_converge ? "PASS" : "FAIL"));
-
-        if (!tensors_converge) {
-            LOG_INFO("Subrequest is inaccurate, failover to reference.");
-            // FIXME: We need to copy reference tensors to actual only in single-model-inference mode
-            // or if our subgraph is last in the chain. 
-            for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) {
-                const auto& actual_tensor = m_subrequests.at(idx)->get_tensor(actual_comp_model->outputs()[i]);
-                const auto& ref_tensor = m_ref_subrequests.at(idx)->get_tensor(ref_comp_model->outputs()[i]);
-                ref_tensor->copy_to(actual_tensor._ptr);
-            }
-            m_npuw_model->m_compiled_submodels.at(idx).compiled_model =
-                m_npuw_model->m_compiled_submodels.at(idx).ref_compiled_model;
-            m_npuw_model->m_compiled_submodels.at(idx).switched_to_ref = true;
-            m_subrequests.at(idx) = m_ref_subrequests.at(idx);
-            update_subrequest_links(idx);
-            failover = true;
+            LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                     ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+            LOG_INFO("PASS");
+        } else {
+            LOG_ERROR(" - " << actual_comp_model->outputs()[i]);
+            LOG_BLOCK();
+            LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] <<
+                      ", threshold: " << m_npuw_model->m_acc_check_threshold << ".");
+            LOG_ERROR("FAIL");
         }
+    }
-        LOG_INFO("Done");
+
+    // If the comparison failed, copy the reference results to the original tensors and mark the
+    // subgraph as switched to reference:
+    if (tensors_converge) {
+        LOG_INFO("PASS");
     } else {
-        LOG_INFO("Skipped, subrequest is launched on reference device.");
+        LOG_ERROR("FAIL");
+        LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results.");
+        if (idx != real_idx) {
+            LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx <<
+                      "], all subrequests corresponding to the latter will be further " <<
+                      "launched on " << m_npuw_model->m_ref_device << ".");
+        } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) {
+            LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " <<
+                      "subrequests corresponding to it will be further launched on " <<
+                      m_npuw_model->m_ref_device << ".");
+        }
+
+        if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) {
+            const auto model = m_npuw_model->m_compiled_submodels[real_idx].model;
+            const auto model_path = "inaccurate_" + model->get_friendly_name() + ".xml";
+            ov::save_model(model, model_path);
+            dump_input_tensors(idx, true);
+            dump_output_tensors(idx, true);
+        }
+
+        // Due to the complex memory management logic, it is safest to simply copy the
+        // results back into the already allocated and linked tensors:
+        copy_results(ref_subr, actual_subr);
+        m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref = true;
+        accuracy_failover = true;
     }
+
+    LOG_INFO("Done");
 }
 
 ov::SoPtr<ov::ITensor> ov::npuw::IBaseInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
@@ -215,19 +374,15 @@ void ov::npuw::IBaseInferRequest::infer() {
         run_subrequest_for_success(idx, failover);
         failover_happened |= failover;
         complete_subrequest(idx);
-        if (m_npuw_model->m_acc_check) {
-            ensure_subrequest_is_accurate(idx, failover);
-            failover_happened |= failover;
-        }
     }
 
     // Increment counter regardless if dumps etc are enabled or not. 
m_run_iter++; if (failover_happened) { - LOG_INFO("Refined device distribution:"); + LOG_ERROR("Refined device distribution:"); LOG_BLOCK(); - m_npuw_model->log_device_dist(); + m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error); } m_now_idx.reset(); } @@ -487,12 +642,11 @@ void ov::npuw::IBaseInferRequest::bind_global_results(std::size_t idx, RqPtr req LOG_DEBUG("Done"); } -void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { +void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx, bool forced) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); - auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - - if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) { + const std::size_t real_idx = real(idx); + if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) { return; } @@ -568,12 +722,12 @@ void ov::npuw::IBaseInferRequest::dump_input_tensors(std::size_t idx) { } } -void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx) { +void ov::npuw::IBaseInferRequest::dump_output_tensors(std::size_t idx, bool forced) { const std::string dump_ios_opt = m_npuw_model->m_cfg.get<::intel_npu::NPUW_DUMP_IO>(); const std::size_t end_idx = m_npuw_model->m_compiled_submodels.size(); auto real_idx = m_npuw_model->m_compiled_submodels[idx].replaced_by.value_or(idx); - if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx)) { + if (!ov::npuw::util::is_set(idx, dump_ios_opt, real_idx, end_idx) && !forced) { return; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp index 576810cc9b216d..c7002aa8f3b2de 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/base_sync_infer_request.hpp @@ -74,8 +74,12 @@ class IBaseInferRequest : public ov::ISyncInferRequest { // their inference requests anymore - they must be stored // only once in the subrequests list RqPtrs create_infer_requests(std::size_t id, size_t nireq = 1, bool* recompiled = nullptr); - void ensure_subrequest_is_accurate(std::size_t idx, bool& failover); - virtual void update_subrequest_links(std::size_t idx) = 0; + void try_accurate_subinfer(std::size_t idx, bool& accuracy_failover); + void try_accurate_subinfer(std::size_t idx, std::size_t offset, std::size_t len, + bool& accuracy_failover); + virtual void try_accurate_substart_async(std::size_t idx); + virtual void try_accurate_subwait(std::size_t idx, bool& accuracy_failover); + virtual void ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover); std::shared_ptr m_npuw_model; std::vector m_completion_cbs; @@ -150,8 +154,9 @@ class IBaseInferRequest : public ov::ISyncInferRequest { virtual void bind_global_params(std::size_t idx, RqPtr request); virtual void bind_global_results(std::size_t idx, RqPtr request); - void dump_input_tensors(std::size_t idx); - void dump_output_tensors(std::size_t idx); + + void dump_input_tensors(std::size_t idx, bool forced = false); + void dump_output_tensors(std::size_t idx, bool forced = false); // Quick-and-dirty profiling ov::npuw::perf::metric m_ms_unpack; @@ -172,11 +177,11 @@ class IBaseInferRequest : public ov::ISyncInferRequest { std::size_t next(std::size_t idx_base) const; std::size_t real(std::size_t idx) const; - RqPtrs m_ref_subrequests; - using 
now_t = std::optional; now_t now_idx() const; + RqPtrs m_ref_subrequests; + private: now_t m_now_idx; }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 3d8ded4191d042..8fe3f806c29654 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -194,6 +194,8 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, const double threshold_opt = m_cfg.get<::intel_npu::NPUW_ACC_THRESH>(); m_acc_check = metrics::NRMSE(threshold_opt); + m_acc_check_name = "NRMSE"; + m_acc_check_threshold = threshold_opt; m_ref_device = m_cfg.getString<::intel_npu::NPUW_ACC_DEVICE>(); LOG_INFO("Accuracy check is enabled."); } @@ -443,8 +445,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } } - m_compiled_submodels[id].device_it = - id != real_id ? m_compiled_submodels[real_id].device_it : m_dev_list.cbegin(); + m_compiled_submodels[id].device_it = m_dev_list.cbegin(); if (forced_sub_devices.count(id)) { std::string forced_device = forced_sub_devices[id]; @@ -499,7 +500,10 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, // Finalize memory in closures and weight banks finalize_weights_bank(); - detach_memory(); + + if (!m_acc_check || (m_acc_check && !m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>())) { + detach_memory(); + } // Print stats report when possible { @@ -1545,7 +1549,7 @@ bool ov::npuw::CompiledModel::is_gather_closure(const std::size_t idx, const std return false; } -void ov::npuw::CompiledModel::log_device_dist() const { +void ov::npuw::CompiledModel::log_device_dist(ov::npuw::LogLevel log_lvl) const { std::unordered_map stats_for_devices; execution_stats stats_for_optimized_out{0.f, 0ul}; @@ -1560,14 +1564,32 @@ void ov::npuw::CompiledModel::log_device_dist() const { stat.ops += real_cm.stat.ops; } - auto print_stats = [this](const std::string& device, const execution_stats& stat) { + auto print_stats = [this, log_lvl](const std::string& device, const execution_stats& stat) { float flops_prcnt = 100.f; float ops_prcnt = 100.f; if (m_total_stat.gflops > 0 && m_total_stat.ops > 0) { flops_prcnt = stat.gflops / static_cast(m_total_stat.gflops) * 100; ops_prcnt = stat.ops / static_cast(m_total_stat.ops) * 100; } - LOG_INFO(device << ": " << flops_prcnt << "% FLOPS, " << ops_prcnt << "% Layers"); + std::stringstream log_msg; + log_msg << device << ": " << flops_prcnt << "% FLOPS, " << ops_prcnt << "% Layers"; + switch (log_lvl) { + case LogLevel::Error: + LOG_ERROR(log_msg.str()); + break; + case LogLevel::Warning: + LOG_WARN(log_msg.str()); + break; + case LogLevel::Info: + LOG_INFO(log_msg.str()); + break; + case LogLevel::Verbose: + LOG_VERB(log_msg.str()); + break; + case LogLevel::Debug: + LOG_DEBUG(log_msg.str()); + break; + } }; for (auto&& device_st : stats_for_devices) { LOG_BLOCK(); @@ -1712,6 +1734,7 @@ void ov::npuw::CompiledModel::implement_properties() { BIND(npuw::accuracy::check, NPUW_ACC_CHECK), BIND(npuw::accuracy::threshold, NPUW_ACC_THRESH), BIND(npuw::accuracy::reference_device, NPUW_ACC_DEVICE), + BIND(npuw::accuracy::dump_failures, NPUW_ACC_DUMP_FAILS), #ifdef NPU_PLUGIN_DEVELOPER_BUILD BIND(npuw::dump::full, NPUW_DUMP_FULL), BIND(npuw::dump::subgraphs, NPUW_DUMP_SUBS), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 79524f3818001d..1bda73f561f978 100644 --- 
a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -18,6 +18,7 @@ #include "serialization.hpp" #include "spatial.hpp" #include "weights_bank.hpp" +#include "logging.hpp" namespace intel_npu { class Plugin; @@ -96,7 +97,8 @@ class CompiledModel : public ov::npuw::ICompiledModel { bool unpack_required(const std::size_t idx) const; bool unpack_required(const std::size_t idx, const std::size_t cidx) const; - void log_device_dist() const; + void log_device_dist(ov::npuw::LogLevel log_lvl = ov::npuw::LogLevel::Info) const; + void implement_properties(); // For full deserialization flow with weights @@ -181,7 +183,9 @@ class CompiledModel : public ov::npuw::ICompiledModel { }; std::vector m_compiled_submodels; - std::function&, const ov::SoPtr&)> m_acc_check; + std::function&, const ov::SoPtr&, double*)> m_acc_check; + std::string m_acc_check_name; + double m_acc_check_threshold; std::string m_ref_device; execution_stats m_total_stat; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index e1c4a957ab2039..1b68732a10d4bd 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -129,7 +129,7 @@ void ov::npuw::FuncMemMgr::assign(const LinkFrom& from) { // - Look for an output tensor to reuse // - If there's one, assign it to this allocation // - If there's none, allocate a new tensor - // - How a tensor to reuse is piced: + // - How a tensor to reuse is picked: // 1. It should exist // 2. It's "remaining reads" count should be 0 (all planned reads // happened at this point). @@ -265,6 +265,7 @@ ov::npuw::JustInferRequest::JustInferRequest(const std::shared_ptrlog_device_dist(); + m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error); } // Identify connections for the funcall pipeline, if needed @@ -578,11 +579,11 @@ void ov::npuw::JustInferRequest::function_prologue(std::size_t idx) { } void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - auto real_idx = comp_model_desc.replaced_by.value_or(idx); + std::size_t real_idx = real(idx); + auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - const auto is_piped = is_pipelined(idx); - auto new_rqs = create_infer_requests(idx, is_piped ? 2 : 1); + const auto is_piped = is_pipelined(real_idx); + auto new_rqs = create_infer_requests(real_idx, is_piped ? 2 : 1); // NB: Regardless if this subrequest was a function call // or not, always use the real_idx here - for regular @@ -599,13 +600,13 @@ void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) { // overkill - only affected subrequest(s) could be updated instead, // but it is a more complex thing and can be implemented separately connect_subrequests(); - m_subrequest_devices[idx] = *comp_model_desc.device_it; + m_subrequest_devices[real_idx] = *comp_model_desc.device_it; } void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, bool& failover) { failover = false; - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; - auto real_idx = comp_model_desc.replaced_by.value_or(idx); + bool accuracy_failover = false; + auto real_idx = real(idx); // Infer is also fail-safe... 
bool job_done = false; @@ -628,7 +629,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo // the subrequest' outputs to global Results, if relevant. bind_global_results(idx); - if (comp_model_desc.replaced_by) { + if (m_npuw_model->m_compiled_submodels[idx].replaced_by) { function_prologue(idx); } if (!dump_in) { @@ -639,7 +640,7 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo try { LOG_DEBUG("Trying to run subrequest[" << idx << "]..."); LOG_BLOCK(); - unsafe_run_this_prep_next(idx, next_prepared); + unsafe_run_this_prep_next(idx, next_prepared, accuracy_failover); job_done = true; LOG_DEBUG("Done: " << idx << "(exec subrequest)"); } catch (const std::exception& ex) { @@ -654,7 +655,8 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo LOG_INFO("- Trying next device..."); // Altering iterators here!! Contracts should be changed! - comp_model_desc.device_it++; + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + proto_comp_model_desc.device_it++; if (!m_npuw_model->compile_for_success(real_idx)) { OPENVINO_THROW("Failed to compile. No more devices are left!"); } @@ -670,36 +672,41 @@ void ov::npuw::JustInferRequest::run_subrequest_for_success(std::size_t idx, boo std::swap(m_subrequests[real_idx], m_funcall_pipeline[real_idx].subrequest); } } + + failover |= accuracy_failover; } -void ov::npuw::JustInferRequest::unsafe_during(std::size_t real_idx, const std::function& f) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - if (!comp_model_desc.spatial) { +void ov::npuw::JustInferRequest::unsafe_during(std::size_t idx, + const std::function& f, + bool& accuracy_failover) { + std::size_t real_idx = real(idx); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!proto_comp_model_desc.spatial) { // Non-spatial execution: trigger request asynchronously, run `f` in this context - auto& r = m_subrequests[real_idx]; - r->start_async(); + try_accurate_substart_async(idx); f(); // expect noexcept - r->wait(); + try_accurate_subwait(idx, accuracy_failover); } else { // Spatial execution... Do the opposite - run f asynchronously, and meanwhile run the // spatial inference auto future = std::async(std::launch::async, f); - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); future.wait(); } } -void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { - auto& comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; - auto& r = m_subrequests[real_idx]; - if (!comp_model_desc.spatial) { +void ov::npuw::JustInferRequest::unsafe_infer(std::size_t idx, bool& accuracy_failover) { + std::size_t real_idx = real(idx); + auto& proto_comp_model_desc = m_npuw_model->m_compiled_submodels[real_idx]; + if (!proto_comp_model_desc.spatial) { // Run normally - r->infer(); + try_accurate_subinfer(idx, accuracy_failover); } else { + auto& r = m_subrequests[real_idx]; // Run over the specified range... Note: the full inputs/outputs // must be prepared in the m_spatial_io at this point - const auto& spatial = comp_model_desc.spatial.value(); - const auto num_outputs = comp_model_desc.compiled_model->outputs().size(); + const auto& spatial = proto_comp_model_desc.spatial.value(); + const auto num_outputs = proto_comp_model_desc.compiled_model->outputs().size(); NPUW_ASSERT(m_spatial_selector); // Create a sparse vector with full input sizes. 
@@ -707,7 +714,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // number of input parameters (activations) so some slots may be // not used here. // FIXME: All these preparations could be done statically (just once) - std::vector full_in_shapes(comp_model_desc.param_base); + std::vector full_in_shapes(proto_comp_model_desc.param_base); for (auto&& param : spatial.params) { full_in_shapes[param.idx] = m_spatial_io[real_idx].inputs.at(param.idx)->get_shape(); } @@ -732,7 +739,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Collect spatial inputs for this offset for (auto&& param : spatial.params) { - const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[param.idx]; const auto& iview = ov::npuw::util::view(m_spatial_io[real_idx].inputs.at(param.idx), param.dim, offset, spatial.nway); r->set_tensor(iport, iview); @@ -740,7 +747,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now set the spatial outputs for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; r->set_tensor(oport, ov::npuw::util::view(m_spatial_io[real_idx].outputs.at(out_idx), spatial.out_dim, @@ -749,7 +756,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } // for(outputs) // Now run the part - r->infer(); + try_accurate_subinfer(idx, offset, spatial.nway, accuracy_failover); } // for(full_nway_times) // Now process the tail, if required @@ -762,7 +769,7 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { offset, spatial.tail_size); - const auto& iport = comp_model_desc.compiled_model->inputs()[param.idx]; + const auto& iport = proto_comp_model_desc.compiled_model->inputs()[param.idx]; auto out_view = ov::npuw::util::view(m_spatial_io[real_idx].input_tails.at(param.idx), param.dim, 0, @@ -774,12 +781,12 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now set the tail tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; + const auto& oport = proto_comp_model_desc.compiled_model->outputs()[out_idx]; r->set_tensor(oport, m_spatial_io[real_idx].output_tails.at(out_idx)); } // for(outputs) // Now run the tail infer - r->infer(); + try_accurate_subinfer(idx, offset, spatial.tail_size, accuracy_failover); // Now copy the views from the output full-nway tensor to the output tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { @@ -798,7 +805,8 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { } } -void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared) { +void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared, + bool& accuracy_failover) { auto& comp_model_desc = m_npuw_model->m_compiled_submodels[idx]; auto real_idx = comp_model_desc.replaced_by.value_or(idx); const std::size_t next_idx = next(idx + 1); @@ -812,18 +820,18 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (is_pipelined(real_idx)) { // function pipelining is here! and the next rq is ours. 
NPUW_ASSERT(m_funcall_pipeline[idx].next.value() == next_idx); - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { LOG_DEBUG("Unpacking closures for the NEXT subrequest[" << next_idx << "]..."); LOG_BLOCK(); // Note: do it here unconditionally - if this request fails, // have to resubmit all the data to the recompiled pair anyway bind_global_parameters(next_idx); unpack_closure(next_idx, m_funcall_pipeline[real_idx].subrequest); - }); + }, accuracy_failover); } else { // Function pipelining is not used. THIS infer request // is also the NEXT one. Nothing much to do here - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); bind_global_parameters(next_idx); } } else { @@ -833,9 +841,9 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool if (next_idx == 0) { // Note: even if m_function_pipelining is ON, // SWAP won't happen here - see the below check for .next - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); } else { - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; @@ -846,21 +854,21 @@ void ov::npuw::JustInferRequest::unsafe_run_this_prep_next(std::size_t idx, bool LOG_BLOCK(); unpack_closure(my_next_idx, m_funcall_pipeline[real_idx].subrequest); } - }); + }, accuracy_failover); } } } else { // This is a regular subgraph. Start it async to prepare the next // parameters if (next_idx == 0) { - unsafe_infer(real_idx); + unsafe_infer(idx, accuracy_failover); } else { - unsafe_during(real_idx, [&]() { + unsafe_during(idx, [&]() { if (!next_prepared) { bind_global_parameters(next_idx); next_prepared = true; } - }); + }, accuracy_failover); } } // if (replaced_by) } @@ -881,10 +889,6 @@ bool ov::npuw::JustInferRequest::supports_async_pipeline() const { return false; } -void ov::npuw::JustInferRequest::update_subrequest_links(std::size_t) { - connect_subrequests(); -} - bool ov::npuw::JustInferRequest::is_pipelined(std::size_t idx) const { const auto& desc = m_npuw_model->m_compiled_submodels[real(idx)]; return m_use_function_pipelining && desc.replaced_by && !desc.forced_to_fcall; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp index 5e8273833b2f16..3a2021041d0ca0 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.hpp @@ -81,7 +81,6 @@ class JustInferRequest final : public IBaseInferRequest { void complete_subrequest(std::size_t idx) override; void cancel_subrequest(std::size_t idx) override; bool supports_async_pipeline() const override; - void update_subrequest_links(std::size_t idx) override; TensorPtr alloc_global_out(std::size_t out_idx) override; @@ -97,9 +96,9 @@ class JustInferRequest final : public IBaseInferRequest { void function_prologue(std::size_t idx); - void unsafe_during(std::size_t real_idx, const std::function& f); - void unsafe_infer(std::size_t real_idx); - void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared_p); + void unsafe_during(std::size_t idx, const std::function& f, bool& accuracy_failover); + void unsafe_infer(std::size_t idx, bool& accuracy_failover); + void unsafe_run_this_prep_next(std::size_t idx, bool& next_prepared, bool& accuracy_failover); void connect_subrequests(); void recreate_subrequests(std::size_t idx); diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp 
b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
index 0e60f2886f142b..edc5b3a87240e9 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.cpp
@@ -37,6 +37,25 @@ ov::npuw::UnfoldInferRequest::UnfoldInferRequest(const std::shared_ptr<ov::npuw::CompiledModel>& compiled_model)
         m_subrequests[i] = proto_comp_model_desc.compiled_model->create_infer_request();
         m_subrequest_devices[i] = *proto_comp_model_desc.device_it;
+
+        if (m_npuw_model->m_acc_check) {
+            LOG_INFO("Create reference subrequest for Subgraph[" << i << "] on " << m_npuw_model->m_ref_device << "...");
+            LOG_BLOCK();
+            if (m_npuw_model->submodel_device(i) != m_npuw_model->m_ref_device) {
+                auto& ref_submodel = m_npuw_model->m_compiled_submodels.at(real(i)).ref_compiled_model;
+                ov::SoPtr<ov::IAsyncInferRequest> ref_infer_request = {ref_submodel->create_infer_request(),
+                                                                       ref_submodel._so};
+                NPUW_ASSERT(ref_infer_request);
+                m_ref_subrequests.at(i) = std::move(ref_infer_request);
+                LOG_INFO("Done");
+            } else {
+                LOG_INFO("Skip creation of reference subrequest for Subgraph["
+                         << i << "] on reference device: " << m_npuw_model->m_ref_device << ", as actual subrequest ["
+                         << i << "] has already been created on "
+                         << "it.");
+            }
+        }
+
         LOG_INFO("DONE");
     }  // for(submodels)
 
@@ -86,6 +105,7 @@ bool ov::npuw::UnfoldInferRequest::valid_subrequest(std::size_t idx) const {
 
 void ov::npuw::UnfoldInferRequest::infer() {
     const bool do_async = m_npuw_model->m_cfg.get<::intel_npu::NPUW_FUNCALL_ASYNC>();
+    bool accuracy_failover = false;
 
     auto prepare = [&](std::size_t idx) {
         if (idx >= m_subrequests.size()) {
             return;
         }
         bind_global_params(idx, m_subrequests[idx]);
         bind_global_results(idx, m_subrequests[idx]);
     };
-    auto wait_and_clear = [](RqPtrs& rqs) {
-        for (auto&& r : rqs) {
-            r->wait();
+    auto wait_and_clear = [&](std::vector<std::size_t>& rqs_ids) {
+        for (auto&& r_id : rqs_ids) {
+            try_accurate_subwait(r_id, accuracy_failover);
         }
-        rqs.clear();
+        rqs_ids.clear();
     };
 
     if (do_async) {
         std::size_t past_repl_id = 0u;
-        RqPtrs previous_requests;
+        std::vector<std::size_t> previous_requests;
 
         prepare(0);
         for (std::size_t idx = 0; idx < m_num_submodels; idx++) {
@@ -121,8 +141,8 @@ void ov::npuw::UnfoldInferRequest::infer() {
                 wait_and_clear(previous_requests);
                 past_repl_id = this_repl_id;
             }
-            subr->start_async();
-            previous_requests.push_back(subr);
+            try_accurate_substart_async(idx);
+            previous_requests.push_back(idx);
             prepare(idx + 1);
         }
         wait_and_clear(previous_requests);
@@ -134,9 +154,189 @@ void ov::npuw::UnfoldInferRequest::infer() {
                 prepare(idx + 1);
                 continue;
             }
-            subr->start_async();
+            try_accurate_substart_async(idx);
             prepare(idx + 1);
-            subr->wait();
+            try_accurate_subwait(idx, accuracy_failover);
         }
     }  // (async)
+
+    if (accuracy_failover) {
+        LOG_ERROR("Refined device distribution:");
+        LOG_BLOCK();
+        m_npuw_model->log_device_dist(ov::npuw::LogLevel::Error);
+    }
+}
+
+namespace {
+    void set_inputs(const ov::SoPtr<ov::IAsyncInferRequest>& from, ov::SoPtr<ov::IAsyncInferRequest>& to) {
+        const auto& from_comp_model = from->get_compiled_model();
+        const auto& to_comp_model = to->get_compiled_model();
+        for (size_t i = 0; i < from_comp_model->inputs().size(); i++) {
+            const auto& itnsr = from->get_tensor(from_comp_model->inputs()[i]);
+            to->set_tensor(to_comp_model->inputs()[i], itnsr);
+        }
+    }
+
+    void copy_results(const ov::SoPtr<ov::IAsyncInferRequest>& from, ov::SoPtr<ov::IAsyncInferRequest>& to) {
+        const auto& from_comp_model = from->get_compiled_model();
+        const auto& to_comp_model = to->get_compiled_model();
+        for (size_t i = 0; i < to_comp_model->outputs().size(); i++) {
+            const auto& from_tnsr = 
from->get_tensor(from_comp_model->outputs()[i]); + const auto& to_tnsr = to->get_tensor(to_comp_model->outputs()[i]); + from_tnsr->copy_to(to_tnsr._ptr); + } + } + + std::stringstream create_launch_msg(std::size_t idx, std::size_t real_idx) { + std::stringstream log_msg_stream; + log_msg_stream << "Launching subrequest[" << idx << "]" << + ((real_idx == idx) ? std::string("...").c_str() : + std::string(std::string(", which is actually subrequest[") + + std::to_string(real_idx) + "]").c_str()); + return log_msg_stream; + } +} // anonymous namespace + +void ov::npuw::UnfoldInferRequest::try_accurate_substart_async(std::size_t subidx) { + auto& act_subr = m_subrequests.at(subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->start_async(); + return; + } + + std::stringstream log_msg_stream = create_launch_msg(subidx, subidx); + log_msg_stream << "..."; + LOG_INFO(log_msg_stream.str()); + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real(subidx)].switched_to_ref) { + LOG_INFO("Subrequest was inaccurate somewhere before, launching it on reference device."); + + auto& act_subr = m_subrequests.at(subidx); + auto& ref_subr = m_ref_subrequests.at(subidx); + + set_inputs(act_subr, ref_subr); + ref_subr->start_async(); + } else { + act_subr->start_async(); + } +} + +void ov::npuw::UnfoldInferRequest::try_accurate_subwait(std::size_t subidx, bool& accuracy_failover) { + auto& act_subr = m_subrequests.at(subidx); + if (!m_npuw_model->m_acc_check) { + act_subr->wait(); + return; + } + + LOG_BLOCK(); + + if (m_npuw_model->m_compiled_submodels[real(subidx)].switched_to_ref) { + auto& act_subr = m_subrequests.at(subidx); + auto& ref_subr = m_ref_subrequests.at(subidx); + + ref_subr->wait(); + copy_results(ref_subr, act_subr); + } else { + act_subr->wait(); + ensure_subrequest_is_accurate(subidx, accuracy_failover); + } +} + +void ov::npuw::UnfoldInferRequest::ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) { + if (!m_npuw_model->m_acc_check) { + return; + } + + LOG_INFO("Check if subrequest[" << idx << "] is accurate..."); + LOG_BLOCK(); + + std::size_t real_idx = real(idx); + OPENVINO_ASSERT(m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref == false); + + if (m_npuw_model->submodel_device(idx) == m_npuw_model->m_ref_device) { + LOG_INFO("Skipped, subrequest[" << idx << "] is launched on reference device."); + return; + } + + accuracy_failover = false; + auto& actual_subr = m_subrequests.at(idx); + auto& ref_subr = m_ref_subrequests.at(idx); + + // Setting inputs: + set_inputs(actual_subr, ref_subr); + + // Running inference: + ref_subr->infer(); + + // Comparing results of actual and reference inferfences: + LOG_INFO("Compare actual outputs against references:"); + bool tensors_converge = true; + const auto& actual_comp_model = actual_subr->get_compiled_model(); + const auto& ref_comp_model = ref_subr->get_compiled_model(); + std::vector converges(actual_comp_model->outputs().size()); + std::vector metrics(actual_comp_model->outputs().size()); + for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + const auto& actual_tensor = actual_subr->get_tensor(actual_comp_model->outputs()[i]); + const auto& ref_tensor = ref_subr->get_tensor(ref_comp_model->outputs()[i]); + converges[i] = m_npuw_model->m_acc_check(actual_tensor, ref_tensor, &metrics[i]); + tensors_converge &= converges[i]; + } + if (tensors_converge == false) { + if (ov::npuw::get_log_level() == ov::npuw::LogLevel::Error) { + // For just log level error print header message: + 
LOG_ERROR("Check if subrequest[" << idx << "] is accurate..."); + } + } + // Log comparison details: + for (size_t i = 0; i < actual_comp_model->outputs().size(); i++) { + if (converges[i]) { + LOG_INFO(" - " << actual_comp_model->outputs()[i]); + LOG_BLOCK(); + LOG_INFO(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] << + ", threshold: " << m_npuw_model->m_acc_check_threshold << "."); + LOG_INFO("PASS"); + } else { + LOG_ERROR(" - " << actual_comp_model->outputs()[i]); + LOG_BLOCK(); + LOG_ERROR(m_npuw_model->m_acc_check_name << " loss: " << metrics[i] << + ", threshold: " << m_npuw_model->m_acc_check_threshold << "."); + LOG_ERROR("FAIL"); + } + } + + // If comparison fails, copy reference results to original tensors and mark subgraph as + // switched to reference: + if (tensors_converge) { + LOG_INFO("PASS"); + } else { + LOG_ERROR("FAIL"); + LOG_ERROR("Subrequest[" << idx << "] is inaccurate, failover to reference results."); + if (idx != real_idx) { + LOG_ERROR("As subrequest[" << idx << "] is actually " << "subrequest[" << real_idx << + "], all subrequests, corresponding to last, will be further " << + "launched on " << m_npuw_model->m_ref_device << ".'"); + } else if (m_npuw_model->m_compiled_submodels[real_idx].replaced_by) { + LOG_ERROR("As subrequest[" << real_idx << "] is actually " << "a function, all " << + "subrequests, corresponding to it, will be further launched on " << + m_npuw_model->m_ref_device << "."); + } + + if (m_npuw_model->m_cfg.get<::intel_npu::NPUW_ACC_DUMP_FAILS>()) { + // Not here anymore due to optimizations. + const auto model = m_npuw_model->m_compiled_submodels[real_idx].model; + const auto model_path = std::string("inaccurate_") + model->get_friendly_name() + std::string(".xml"); + ov::save_model(model, model_path); + dump_input_tensors(idx, true); + dump_output_tensors(idx, true); + } + + // Due to complex memory management logic it is safe to just copy + // results back to already properly allocated and linked tensors: + copy_results(ref_subr, actual_subr); + m_npuw_model->m_compiled_submodels[real_idx].switched_to_ref = true; + accuracy_failover = true; + } + + LOG_INFO("Done"); } diff --git a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp index 76b67571ec4c40..170304eb52633b 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/unfold_sync_infer_request.hpp @@ -32,10 +32,13 @@ class UnfoldInferRequest final : public IBaseInferRequest { bool supports_async_pipeline() const override { return false; } - void update_subrequest_links(std::size_t) override {} private: void infer() override; + void try_accurate_substart_async(std::size_t idx) override; + void try_accurate_subwait(std::size_t idx, bool& accuracy_failover) override; + void ensure_subrequest_is_accurate(std::size_t idx, bool& accuracy_failover) override; + }; } // namespace npuw