Skip to content

LLM NPUW tests for PR checks #30051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/tests/functional/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ ov_add_test_target(
openvino::npu_common
openvino::npu_al)

# Some NPUW tests register and run the real CPU plugin at runtime; build it
# together with the test target and expose WITH_CPU_PLUGIN so those tests are
# compiled only when the CPU plugin is part of the build.
if(ENABLE_INTEL_CPU)
add_dependencies(${TARGET_NAME} openvino_intel_cpu_plugin)
target_compile_definitions(${TARGET_NAME} PRIVATE WITH_CPU_PLUGIN)
endif()

if(WIN32)
if(ENABLE_DX12)
target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_DX12)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "behavior_tests.hpp"
#include "comparators/nrmse.hpp"
#include "test_engine/comparators/nrmse.hpp"
#include "openvino/util/shared_object.hpp"
#include "openvino/util/file_util.hpp"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
Expand All @@ -8,10 +8,10 @@
#include <memory>
#include <random>

#include "generators/model_generator.hpp"
#include "test_engine/generators/model_generator.hpp"
#include "test_engine/mocks/mock_plugins.hpp"
#include "test_engine/mocks/register_in_ov.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "mocks/mock_plugins.hpp"
#include "mocks/register_in_ov.hpp"
#include "openvino/runtime/core.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/runtime/make_tensor.hpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_engine/comparators/nrmse.hpp"
#include "test_engine/simple_llm_pipeline.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "openvino/util/shared_object.hpp"
#include "openvino/util/file_util.hpp"

#include <filesystem>
#include <algorithm>

#include <gtest/gtest.h>

using namespace testing;
using namespace ov::npuw::tests;
using namespace ov::intel_npu::npuw;

// These tests load the CPU plugin by its library file name, which is not
// possible with a static OpenVINO build; they also require the CPU plugin to
// be part of the build (WITH_CPU_PLUGIN) and an x86 host to run it on.
#ifndef OPENVINO_STATIC_LIBRARY
#ifdef WITH_CPU_PLUGIN
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
// Base library name of the real CPU plugin; OV_BUILD_POSTFIX is appended
// when the plugin is registered (see SetUp below).
const char* cpu_plugin_file_name = "openvino_intel_cpu_plugin";

namespace {
const std::vector<int64_t> What_is_OpenVINO =
{529, 29989, 1792, 29989, 29958, 13, 5618, 338, 4673, 29963, 1177,
29949, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13};
const std::vector<int64_t> OpenVINO =
{6585, 29963, 1177, 29949};
Comment on lines +26 to +31
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this namespace can have a name token_ids.


using LLMTestParams = std::tuple<std::string, ov::AnyMap,
std::vector<int64_t>, std::vector<int64_t>>;
} // anonymous namespace

// Parameterized pipeline test: runs a small LLM through NPUW with the given
// config on the real CPU plugin and compares generated tokens to a reference.
// NOTE(review): the "Accuracy" name may be confused with the real accuracy
// test suites; consider renaming.
class LLMAccuracyTestsNPUW : public ::testing::TestWithParam<LLMTestParams> {
public:
    void SetUp() override {
        // NOTE: TEMPLATE plugin in OpenVINO works for ~20 minutes to generate
        // the first token from the prefill model and crashes on launch of the
        // 3rd subrequest in the generate model.
        // There is no such issue with the CPU plugin, so the CPU plugin was
        // chosen for accuracy checks.
        // Register CPU plugin in OpenVINO:
        try {
            core.register_plugin(std::string(cpu_plugin_file_name) + OV_BUILD_POSTFIX, "CPU");
        } catch (const ov::Exception& ex) {
            if (std::string{ex.what()}.find("Device with \"CPU\" is already registered in the OpenVINO Runtime")
                == std::string::npos) {
                // Rethrow the original exception; `throw ex;` would throw a
                // sliced copy of a possibly-derived exception type.
                throw;
            }
        }

        auto param = GetParam();
        ov::AnyMap config;
        std::tie(model_path, config, input_ids, reference_ids) = param;
        // Run all NPUW submodels on CPU and keep the response short so the
        // test stays fast.
        config["NPUW_DEVICES"] = "CPU";
        config["NPUW_LLM_MIN_RESPONSE_LEN"] = 4;
        simple_llm.initialize(model_path, core, config);
    }

protected:
    ov::Core core;
    SimpleLLMPipeline simple_llm;
    std::string model_path;              // model IR path (test parameter)
    std::vector<int64_t> input_ids;      // prompt token ids (test parameter)
    std::vector<int64_t> actual_ids;     // tokens produced by generate()
    std::vector<int64_t> reference_ids;  // expected tokens (test parameter)
};

// The generated token sequence must match the reference exactly.
TEST_P(LLMAccuracyTestsNPUW, ConfigIsAccurate) {
    actual_ids = simple_llm.generate(input_ids);
    // Compare the whole vectors: the original indexed loop mixed a signed
    // counter with size() and could read past the end of reference_ids when
    // the generation was longer than the reference.
    ASSERT_EQ(actual_ids, reference_ids);
}

// TODO(review): the model path below is a hard-coded local Windows path
// ("C:\apronina\..."); it must be replaced with a path resolved from the
// test data/environment before these tests can run on CI or any other
// machine.
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_FAST_COMPILE, LLMAccuracyTestsNPUW,
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
testing::Values(ov::AnyMap{}),
testing::Values(What_is_OpenVINO),
testing::Values(OpenVINO)));

// Same model/tokens with the BEST_PERF generate hint.
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_BEST_PERF, LLMAccuracyTestsNPUW,
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"}}),
testing::Values(What_is_OpenVINO),
testing::Values(OpenVINO)));

// BEST_PERF generate hint combined with the DYNAMIC prefill hint.
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_DYNAMIC_BEST_PERF, LLMAccuracyTestsNPUW,
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"},
{"NPUW_LLM_PREFILL_HINT", "DYNAMIC"}}),
testing::Values(What_is_OpenVINO),
testing::Values(OpenVINO)));

#endif // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
#endif // WITH_CPU_PLUGIN
#endif // not OPENVINO_STATIC_LIBRARY
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_engine/simple_llm_pipeline.hpp"
#include "test_engine/mocks/mock_plugins.hpp"
#include "test_engine/mocks/register_in_ov.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "openvino/util/shared_object.hpp"
#include "openvino/util/file_util.hpp"

#include <filesystem>

using namespace testing;
using namespace ov::npuw::tests;
using namespace ov::intel_npu::npuw;

namespace {
// Input token ids for the behavior tests: the chat-formatted
// "What is OpenVINO?" prompt.
const std::vector<int64_t> What_is_OpenVINO = {
    529,   29989, 1792,  29989, 29958, 13,    5618,  338,
    4673,  29963, 1177,  29949, 29973, 2,     29871, 13,
    29966, 29989, 465,   22137, 29989, 29958, 13,
};
}  // anonymous namespace

// Helper macros that make the gmock expectations on the mock plugins below
// read like a small DSL (device/model/request index wrappers + expectation
// setters). Extra variadic arguments are appended as gmock actions, e.g.
// THROW("boom").

// Readability wrappers for numeric macro arguments.
#define TIMES(times) times
#define MODEL(idx) idx
#define INFER_REQ(idx) idx

// Optional action: make the next matching call throw.
#define THROW(...) \
    .WillOnce(Throw(std::runtime_error(__VA_ARGS__)))

// Expect compile_model() on the given device's mock plugin `times` times.
// (The original definition ended with a dangling `\` that spliced the
// following blank line into the macro.)
#define EXPECT_COMPILE_MODEL(device, times, ...)                                \
    EXPECT_CALL(*device##_plugin,                                               \
                compile_model(A<const std::shared_ptr<const ov::Model>&>(), _)) \
        .Times(times)                                                           \
        __VA_ARGS__

// Expect create_sync_infer_request() on compiled model `model_idx`.
#define EXPECT_CREATE_SYNC_INFER_REQ(device, model_idx, times, ...)                            \
    device##_plugin->set_expectations_to_comp_models(model_idx, [](MockCompiledModel& model) { \
        EXPECT_CALL(model, create_sync_infer_request())                                        \
            .Times(times)                                                                      \
            __VA_ARGS__;                                                                       \
    })

// Expect infer() on the first infer request of compiled model `model_idx`.
// Trailing semicolons were dropped from the macro bodies: call sites already
// terminate the invocation with `;`, which produced empty statements.
#define EXPECT_INFER(device, model_idx, times, ...)                                               \
    device##_plugin->set_expectations_to_infer_reqs(model_idx, 0, [](MockInferRequest& request) { \
        EXPECT_CALL(request, infer())                                                             \
            .Times(times)                                                                         \
            __VA_ARGS__;                                                                          \
    })

// Expect infer() on infer request `req_idx` of compiled model `model_idx`.
#define EXPECT_INFER_FOR(device, model_idx, req_idx, times, ...)                                        \
    device##_plugin->set_expectations_to_infer_reqs(model_idx, req_idx, [](MockInferRequest& request) { \
        EXPECT_CALL(request, infer())                                                                   \
            .Times(times)                                                                               \
            __VA_ARGS__;                                                                                \
    })

namespace ov {
namespace npuw {
namespace tests {

// Fixture that routes NPUW's LLM prefill and generate submodels to two
// distinct mock NPU plugins so the number of compile/infer-request/infer
// calls on each of them can be verified independently.
class LLMBehaviorTestsNPUW : public ::testing::Test {
public:
void SetUp() override {
// Create the mock plugin objects and their gmock implementations; they
// are registered in ov::Core later via register_mock_plugins_in_ov().
mock_npu_for_prefill_plugin = std::make_shared<MockNpuPluginForPrefill>();
mock_npu_for_prefill_plugin->create_implementation();
mock_npu_for_generate_plugin = std::make_shared<MockNpuPluginForGenerate>();
mock_npu_for_generate_plugin->create_implementation();
// Route prefill and generate to their own mock devices, and keep the
// response short so the tests run fast.
config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForPrefill"}};
config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForGenerate"}};
config["NPUW_LLM_MIN_RESPONSE_LEN"] = 2;
}

// Registers the mock plugins in ov::Core and keeps their shared objects
// alive for the fixture's lifetime.
// Make sure it is called after expectations are set!
void register_mock_plugins_in_ov() {
m_shared_objects.push_back(reg_plugin<MockNpuPluginForPrefill>(core, mock_npu_for_prefill_plugin));
m_shared_objects.push_back(reg_plugin<MockNpuPluginForGenerate>(core, mock_npu_for_generate_plugin));
}

protected:
ov::Core core;
SimpleLLMPipeline simple_llm;
std::shared_ptr<MockNpuPluginForPrefill> mock_npu_for_prefill_plugin;
std::shared_ptr<MockNpuPluginForGenerate> mock_npu_for_generate_plugin;
ov::AnyMap config;

private:
// Holds the loaded plugin shared objects so they outlive the registered
// plugins.
std::vector<std::shared_ptr<void>> m_shared_objects;
};
} // namespace tests
} // namespace npuw
} // namespace ov

// Default (FAST_COMPILE) path: both prefill and generate models are split
// into head/repeated-function/tail (3 compiles each); the expected request
// and infer counts below encode that structure.
TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_FAST_COMPILE) {
// Set expectations first:
{
InSequence seq;
// Generate model:
EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(3));
// Prefill model:
EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3));
}

// ------------------------ Prefill model ---------------------------
// 1 infer request for head:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1));
// 2 infer requests for function, `create_sync_infer_request()`
// should be called twice here (2 requests to form pipelining):
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2));
// 1 infer request for tail:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1));

// Head's infer request is called once:
EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1));
// There are 20 repeated functions, so:
// Repeated block's 1st infer request is called 10 times:
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10));
// Repeated block's 2nd infer request (brother of 1st one) is called 10 times:
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10));
// Tail's infer request is called once:
EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1));

// ------------------------ Generate model ---------------------------
// 1 infer request for head:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1));
// `create_sync_infer_request()` should be called 20 times
// to create 20 separate infer requests here (due to UNFOLD_IREQS):
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(1), TIMES(20));
// 1 infer request for tail:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(2), TIMES(1));

// Head's infer request is called once:
EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1));
// Different 20 infer requests of 2nd submodel should be called:
for (int i = 0; i < 20; ++i) {
EXPECT_INFER_FOR(mock_npu_for_generate, MODEL(1), INFER_REQ(i), TIMES(1));
}
// Tail's infer request is called once:
EXPECT_INFER(mock_npu_for_generate, MODEL(2), TIMES(1));

// Register mock objects as plugins in OpenVINO:
register_mock_plugins_in_ov();

// Do the actual test:
// TODO(review): hard-coded local Windows model path — replace with a path
// resolved from the test data/environment before merging.
simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml",
core, config);
EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO));
}

// BEST_PERF generate hint: the generate model is compiled as a single
// monolithic model (1 compile, 1 request, 1 infer) while prefill keeps the
// head/function/tail split with 2-request pipelining.
TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_BEST_PERF) {
    // Set expectations first:
    {
        InSequence seq;
        // Generate model (single compile under BEST_PERF):
        EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(1));
        // Prefill model (head + repeated function + tail):
        EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3));
    }

    // ------------------------ Prefill model ---------------------------
    // 1 infer request for head:
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1));
    // 2 infer requests for function: `create_sync_infer_request()`
    // should be called twice here (2 requests to form pipelining):
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2));
    // 1 infer request for tail:
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1));

    // Head's infer request is called once:
    EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1));
    // There are 20 repeated functions, so:
    // Repeated block's 1st infer request is called 10 times:
    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10));
    // Repeated block's 2nd infer request (brother of 1st one) is called 10 times:
    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10));
    // Tail's infer request is called once:
    EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1));

    // ------------------------ Generate model ---------------------------
    // 1 infer request for the whole generate model:
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1));
    // Infer request is called only once:
    EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1));

    // Register mock objects as plugins in OpenVINO:
    register_mock_plugins_in_ov();

    // Do the actual test:
    config["NPUW_LLM_GENERATE_HINT"] = "BEST_PERF";
    // TODO(review): hard-coded local Windows model path — replace with a path
    // resolved from the test data/environment before merging.
    simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml",
                          core, config);
    EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO));
}

Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ MockPluginBase<DeviceType>::~MockPluginBase() {
}

// Explicit instantiations of the mock-plugin template for each mock device
// tag type used by the tests (declared in mock_plugins.hpp).
template class MockPluginBase<ov::npuw::tests::mocks::Npu>;
template class MockPluginBase<ov::npuw::tests::mocks::NpuForPrefill>;
template class MockPluginBase<ov::npuw::tests::mocks::NpuForGenerate>;
template class MockPluginBase<ov::npuw::tests::mocks::Cpu>;

} // namespace tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,23 @@ struct Npu {
static constexpr const char* name = "MockNPU";
static constexpr int num_device_ids = 3;
};
struct NpuForPrefill {
static constexpr const char* name = "MockNPUForPrefill";
static constexpr int num_device_ids = 3;
};
struct NpuForGenerate {
static constexpr const char* name = "MockNPUForGenerate";
static constexpr int num_device_ids = 3;
};
struct Cpu {
static constexpr const char* name = "MockCPU";
static constexpr int num_device_ids = 2;
};
} //namespace mocks

// NiceMock wrappers over the mock plugin template: calls without an explicit
// expectation are silently allowed instead of producing gmock warnings.
using MockNpuPlugin = testing::NiceMock<MockPluginBase<mocks::Npu>>;
using MockNpuPluginForPrefill = testing::NiceMock<MockPluginBase<mocks::NpuForPrefill>>;
using MockNpuPluginForGenerate = testing::NiceMock<MockPluginBase<mocks::NpuForGenerate>>;
using MockCpuPlugin = testing::NiceMock<MockPluginBase<mocks::Cpu>>;

} // namespace tests
Expand Down
Loading
Loading