LLM Behavior tests

AsyaPronina · AsyaPronina · commit 53a65f7e27b5 · 2025-04-09T22:45:07.000+01:00
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.hpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/behavior_tests.hpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #pragma once
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/llm_accuracy_tests.cpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/llm_accuracy_tests.cpp
@@ -1,4 +1,4 @@
-// Copyright (C) 2024 Intel Corporation
+// Copyright (C) 2024-2025 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 
@@ -17,20 +17,11 @@ using namespace testing;
 using namespace ov::npuw::tests;
 using namespace ov::intel_npu::npuw;
 
-
-#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
-const char* cpu_plugin_file_name = "openvino_arm_cpu_plugin";
-#elif defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
-const char* cpu_plugin_file_name = "openvino_intel_cpu_plugin";
-#elif defined(OPENVINO_ARCH_RISCV64)
-const char* cpu_plugin_file_name = "openvino_riscv_cpu_plugin";
-#else
-#    error "Undefined system processor"
-#endif
-
 // this tests load plugin by library name: this is not available during static linkage
 #ifndef OPENVINO_STATIC_LIBRARY
 #ifdef WITH_CPU_PLUGIN
+#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
+const char* cpu_plugin_file_name = "openvino_intel_cpu_plugin";
 
 namespace {
 const std::vector<int64_t> What_is_OpenVINO =
@@ -43,12 +34,8 @@ using LLMTestParams = std::tuple<std::string, ov::AnyMap,
                                  std::vector<int64_t>, std::vector<int64_t>>;
 } // anonymous namespace
 
-class LLMAccuracyTest : public ::testing::TestWithParam<LLMTestParams> {
+class LLMAccuracyTestsNPUW : public ::testing::TestWithParam<LLMTestParams> {
 public:
-    ov::Core core;
-    ov::AnyMap use_npuw_props;
-    std::shared_ptr<ov::Model> model;
-
     void SetUp() override {
         // NOTE: TEMPLATE plugin in OpenVINO works for ~20 minute to generate
         //       first token from prefill model and crashes on launch of
@@ -67,54 +54,47 @@ class LLMAccuracyTest : public ::testing::TestWithParam<LLMTestParams> {
 
         auto param = GetParam();
         ov::AnyMap config;
-        std::tie(m_model_path, config, m_input_ids, m_reference_ids) = param;
+        std::tie(model_path, config, input_ids, reference_ids) = param;
         config["NPUW_DEVICES"] = "CPU";
         config["NPUW_LLM_MIN_RESPONSE_LEN"] = 4;
-        m_simple_llm.initialize(m_model_path, core, config);
+        simple_llm.initialize(model_path, core, config);
     }
 
-    void generate() {
-        m_actual_ids = m_simple_llm.generate(m_input_ids);
-        PRINT_ARRAY(m_actual_ids, m_actual_ids.size());
-    }
-
-    void accurate() {
-        for (auto i = 0; i < m_actual_ids.size(); ++i) {
-            ASSERT_EQ(m_actual_ids[i], m_reference_ids[i]);
-        }
-    }
-
-private:
-    std::string m_model_path;
-    SimpleLLMPipeline m_simple_llm;
-    std::vector<int64_t> m_input_ids;
-    std::vector<int64_t> m_actual_ids;
-    std::vector<int64_t> m_reference_ids;
+protected:
+    ov::Core core;
+    SimpleLLMPipeline simple_llm;
+    std::string model_path;
+    std::vector<int64_t> input_ids;
+    std::vector<int64_t> actual_ids;
+    std::vector<int64_t> reference_ids;
 };
 
-TEST_P(LLMAccuracyTest, ConfigIsAccurate) {
-    generate();
-    accurate();
+TEST_P(LLMAccuracyTestsNPUW, ConfigIsAccurate) {
+    actual_ids = simple_llm.generate(input_ids);
+    // Fail the test instead of reading past the end of the reference when more ids are generated than expected:
+    ASSERT_LE(actual_ids.size(), reference_ids.size());
+    for (std::size_t i = 0; i < actual_ids.size(); ++i) { ASSERT_EQ(actual_ids[i], reference_ids[i]); }
 }
 
-INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_FAST_COMPILE, LLMAccuracyTest,
+INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_FAST_COMPILE, LLMAccuracyTestsNPUW,
     ::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
                        testing::Values(ov::AnyMap{}),
                        testing::Values(What_is_OpenVINO),
                        testing::Values(OpenVINO)));
 
-INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_BEST_PERF, LLMAccuracyTest,
+INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_BEST_PERF, LLMAccuracyTestsNPUW,
 ::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
                     testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"}}),
                     testing::Values(What_is_OpenVINO),
                     testing::Values(OpenVINO)));
 
-INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_DYNAMIC_BEST_PERF, LLMAccuracyTest,
+INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_DYNAMIC_BEST_PERF, LLMAccuracyTestsNPUW,
     ::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
                         testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"},
                                                    {"NPUW_LLM_PREFILL_HINT", "DYNAMIC"}}),
                         testing::Values(What_is_OpenVINO),
                         testing::Values(OpenVINO)));
 
+#endif // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
 #endif // WITH_CPU_PLUGIN
 #endif // not OPENVINO_STATIC_LIBRARY
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/llm_behavior_tests.cpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/llm_behavior_tests.cpp
@@ -0,0 +1,197 @@
+// Copyright (C) 2024-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_engine/simple_llm_pipeline.hpp"
+#include "test_engine/mocks/mock_plugins.hpp"
+#include "test_engine/mocks/register_in_ov.hpp"
+#include "intel_npu/npuw_private_properties.hpp"
+#include "openvino/util/shared_object.hpp"
+#include "openvino/util/file_util.hpp"
+
+#include <filesystem>
+
+using namespace testing;
+using namespace ov::npuw::tests;
+using namespace ov::intel_npu::npuw;
+// Input ids fed to SimpleLLMPipeline::generate() by the tests below (presumably a tokenized prompt — TODO confirm).
+namespace {
+    const std::vector<int64_t> What_is_OpenVINO =
+        {529, 29989, 1792, 29989, 29958, 13, 5618, 338, 4673, 29963, 1177,
+         29949, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13};
+} // anonymous namespace
+// TIMES(n) expands to `n` unchanged; it only labels the call-count argument at expectation sites.
+#define TIMES(times) times 
+// THROW(args...) appends a `.WillOnce(Throw(std::runtime_error(args...)))` clause to an expectation.
+#define THROW(...) \
+    .WillOnce(Throw(std::runtime_error(__VA_ARGS__)))
+// Expects `times` compile_model(<model>, <any properties>) calls on `<device>_plugin`; extra gMock clauses may follow.
+#define EXPECT_COMPILE_MODEL(device, times, ...)                        \
+    EXPECT_CALL(*device##_plugin,                                       \
+        compile_model(A<const std::shared_ptr<const ov::Model>&>(), _)) \
+        .Times(times)                                                   \
+        __VA_ARGS__
+
+#define MODEL(idx) idx  // Expands to `idx` unchanged; labels a compiled-model index argument.
+
+#define INFER_REQ(idx) idx  // Expands to `idx` unchanged; labels an infer-request index argument.
+// Hands `model_idx` and a capture-less callback to set_expectations_to_comp_models(); the callback expects `times` create_sync_infer_request() calls.
+#define EXPECT_CREATE_SYNC_INFER_REQ(device, model_idx, times, ...)                            \
+    device##_plugin->set_expectations_to_comp_models(model_idx, [](MockCompiledModel& model) { \
+        EXPECT_CALL(model, create_sync_infer_request())                                        \
+        .Times(times)                                                                          \
+        __VA_ARGS__;                                                                           \
+    })
+// Expects `times` infer() calls on infer request 0 of model `model_idx`; no trailing ';' here, call sites supply it.
+#define EXPECT_INFER(device, model_idx, times, ...) \
+    device##_plugin->set_expectations_to_infer_reqs(model_idx, 0, [](MockInferRequest& request) { \
+        EXPECT_CALL(request, infer())                                                             \
+        .Times(times)                                                                             \
+        __VA_ARGS__;                                                                              \
+    })
+// Expects `times` infer() calls on infer request `req_idx` of model `model_idx`; no trailing ';' here, call sites supply it.
+#define EXPECT_INFER_FOR(device, model_idx, req_idx, times, ...) \
+    device##_plugin->set_expectations_to_infer_reqs(model_idx, req_idx, [](MockInferRequest& request) { \
+        EXPECT_CALL(request, infer())                                                                   \
+        .Times(times)                                                                                   \
+        __VA_ARGS__;                                                                                    \
+    })
+
+namespace ov {
+namespace npuw {
+namespace tests {
+
+class LLMBehaviorTestsNPUW : public ::testing::Test {  // Fixture: one SimpleLLMPipeline driven through two mock NPU plugins.
+public:
+    void SetUp() override {  // Creates both mocks and points the prefill/generate NPUW_DEVICES at their device names.
+        mock_npu_for_prefill_plugin = std::make_shared<MockNpuPluginForPrefill>();
+        mock_npu_for_prefill_plugin->create_implementation();
+        mock_npu_for_generate_plugin = std::make_shared<MockNpuPluginForGenerate>();
+        mock_npu_for_generate_plugin->create_implementation();
+        config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForPrefill"}};
+        config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForGenerate"}};
+        config["NPUW_LLM_MIN_RESPONSE_LEN"] = 2;
+    }
+
+    // Registers both mock plugins in `core`. Make sure it is called after expectations are set!
+    void register_mock_plugins_in_ov() {
+        m_shared_objects.push_back(reg_plugin<MockNpuPluginForPrefill>(core, mock_npu_for_prefill_plugin));
+        m_shared_objects.push_back(reg_plugin<MockNpuPluginForGenerate>(core, mock_npu_for_generate_plugin));
+    }
+
+protected:
+    ov::Core core;  // Core the mock plugins are registered in and the pipeline is initialized with.
+    SimpleLLMPipeline simple_llm;  // Pipeline under test; initialized by each test body.
+    std::shared_ptr<MockNpuPluginForPrefill> mock_npu_for_prefill_plugin;  // Receives the prefill-side expectations.
+    std::shared_ptr<MockNpuPluginForGenerate> mock_npu_for_generate_plugin;  // Receives the generate-side expectations.
+    ov::AnyMap config;  // Properties passed to simple_llm.initialize(); tests may extend it first.
+
+private:
+    std::vector<std::shared_ptr<void>> m_shared_objects;  // Holds the handles returned by reg_plugin() for the fixture's lifetime.
+};
+}  // namespace tests
+}  // namespace npuw
+}  // namespace ov
+
+TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_FAST_COMPILE) {
+    // Set expectations first (InSequence: generate model compilations are expected before prefill ones):
+    {
+        InSequence seq;
+        // Generate model:
+        EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(3));
+        // Prefill model:
+        EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3));
+    }
+
+    // ------------------------ Prefill model ---------------------------
+    // 1 infer request for head:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1));  
+    // 2 infer requests for function, `create_sync_infer_request()`
+    // should be called twice here (2 requests to form pipelining):
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2));
+    // 1 infer request for tail:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1));
+
+    // Head's infer request is called once:
+    EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1));
+    // There are 20 repeated functions, so:
+    // Repeated block's 1st infer request is called 10 times:
+    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10));
+    // Repeated block's 2nd infer request (brother of 1st one) is called 10 times:
+    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10));
+    // Tail's infer request is called once:
+    EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1));
+
+    // ------------------------ Generate model ---------------------------
+    // 1 infer request for head:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1));  
+    // 20 infer requests for function, `create_sync_infer_request()`
+    // should be called 20 times here (due to UNFOLD_IREQS):
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(1), TIMES(20));
+    // 1 infer request for tail:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(2), TIMES(1));
+
+    // Head's infer request is called once:
+    EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1));
+    // Each of the 20 different infer requests of the 2nd submodel should be called once:
+    for (int i = 0; i < 20; ++i) {
+        EXPECT_INFER_FOR(mock_npu_for_generate, MODEL(1), INFER_REQ(i), TIMES(1));
+    }
+    // Tail's infer request is called once:
+    EXPECT_INFER(mock_npu_for_generate, MODEL(2), TIMES(1));
+
+    // Register mock objects as plugins in OpenVINO:
+    register_mock_plugins_in_ov();
+
+    // Do the actual test (NOTE(review): the model path below is a machine-local absolute Windows path):
+    simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml",
+                          core, config); 
+    EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO));
+}
+ 
+TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_BEST_PERF) {
+    // Set expectations first (InSequence: the generate model compilation is expected before prefill ones):
+    {
+        InSequence seq;
+        // Generate model:
+        EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(1));
+        // Prefill model:
+        EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3));
+    }
+
+    // ------------------------ Prefill model ---------------------------
+    // 1 infer request for head:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1));  
+    // 2 infer requests for function, `create_sync_infer_request()`
+    // should be called twice here (2 requests to form pipelining):
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2));
+    // 1 infer request for tail:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1));
+
+    // Head's infer request is called once:
+    EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1));
+    // There are 20 repeated functions, so:
+    // Repeated block's 1st infer request is called 10 times:
+    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10));
+    // Repeated block's 2nd infer request (brother of 1st one) is called 10 times:
+    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10));
+    // Tail's infer request is called once:
+    EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1));
+
+    // ------------------------ Generate model ---------------------------
+    // 1 infer request for the whole generate model:
+    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1));  
+    // Infer request is called only once:
+    EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1));
+
+    // Register mock objects as plugins in OpenVINO:
+    register_mock_plugins_in_ov();
+
+    // Do the actual test (NOTE(review): the model path below is a machine-local absolute Windows path):
+
+    config["NPUW_LLM_GENERATE_HINT"] = "BEST_PERF";
+    simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml",
+                            core, config); 
+    EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO));
+}
+
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/test_engine/mocks/mock_plugins.cpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/test_engine/mocks/mock_plugins.cpp
@@ -303,6 +303,8 @@ MockPluginBase<DeviceType>::~MockPluginBase() {
 }
 
 template class MockPluginBase<ov::npuw::tests::mocks::Npu>;
+template class MockPluginBase<ov::npuw::tests::mocks::NpuForPrefill>;
+template class MockPluginBase<ov::npuw::tests::mocks::NpuForGenerate>;
 template class MockPluginBase<ov::npuw::tests::mocks::Cpu>;
 
 }  // namespace tests
diff --git a/src/plugins/intel_npu/tests/functional/behavior/npuw/test_engine/mocks/mock_plugins.hpp b/src/plugins/intel_npu/tests/functional/behavior/npuw/test_engine/mocks/mock_plugins.hpp
@@ -159,13 +159,23 @@ struct Npu {
     static constexpr const char* name = "MockNPU";
     static constexpr int num_device_ids = 3;
 };
+struct NpuForPrefill {  // Device tag for MockPluginBase; `name` is the NPUW_DEVICES value the LLM behavior tests use for prefill.
+    static constexpr const char* name = "MockNPUForPrefill";
+    static constexpr int num_device_ids = 3;
+};
+struct NpuForGenerate {  // Device tag for MockPluginBase; `name` is the NPUW_DEVICES value the LLM behavior tests use for generate.
+    static constexpr const char* name = "MockNPUForGenerate";
+    static constexpr int num_device_ids = 3;
+};
 struct Cpu {
     static constexpr const char* name = "MockCPU";
     static constexpr int num_device_ids = 2;
 };
 } //namespace mocks
 
 using MockNpuPlugin = testing::NiceMock<MockPluginBase<mocks::Npu>>;
+using MockNpuPluginForPrefill = testing::NiceMock<MockPluginBase<mocks::NpuForPrefill>>;
+using MockNpuPluginForGenerate = testing::NiceMock<MockPluginBase<mocks::NpuForGenerate>>;
 using MockCpuPlugin = testing::NiceMock<MockPluginBase<mocks::Cpu>>;
 
 }  // namespace tests

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// Copyright (C) 2024 Intel Corporation`
	`1`	`+// Copyright (C) 2024-2025 Intel Corporation`
`2`	`2`	`// SPDX-License-Identifier: Apache-2.0`
`3`	`3`	`//`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -303,6 +303,8 @@ MockPluginBase<DeviceType>::~MockPluginBase() {`
`303`	`303`	`}`
`304`	`304`
`305`	`305`	`template class MockPluginBase<ov::npuw::tests::mocks::Npu>;`
	`306`	`+template class MockPluginBase<ov::npuw::tests::mocks::NpuForPrefill>;`
	`307`	`+template class MockPluginBase<ov::npuw::tests::mocks::NpuForGenerate>;`
`306`	`308`	`template class MockPluginBase<ov::npuw::tests::mocks::Cpu>;`
`307`	`309`
`308`	`310`	`} // namespace tests`