Skip to content

LLM NPUW tests for PR checks #30051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/tests/functional/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ ov_add_test_target(
openvino::npu_common
openvino::npu_al)

# Some NPUW tests register and run the real CPU plugin at runtime; build it
# together with the test target and expose WITH_CPU_PLUGIN so those tests are
# compiled only when the CPU plugin is part of the build.
if(ENABLE_INTEL_CPU)
add_dependencies(${TARGET_NAME} openvino_intel_cpu_plugin)
target_compile_definitions(${TARGET_NAME} PRIVATE WITH_CPU_PLUGIN)
endif()

if(WIN32)
if(ENABLE_DX12)
target_compile_definitions(${TARGET_NAME} PRIVATE ENABLE_DX12)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "behavior_tests.hpp"
#include "comparators/nrmse.hpp"
#include "test_engine/comparators/nrmse.hpp"
#include "openvino/util/shared_object.hpp"
#include "openvino/util/file_util.hpp"

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (C) 2024 Intel Corporation
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
#pragma once
Expand All @@ -8,10 +8,10 @@
#include <memory>
#include <random>

#include "generators/model_generator.hpp"
#include "test_engine/generators/model_generator.hpp"
#include "test_engine/mocks/mock_plugins.hpp"
#include "test_engine/mocks/register_in_ov.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "mocks/mock_plugins.hpp"
#include "mocks/register_in_ov.hpp"
#include "openvino/runtime/core.hpp"
#include "openvino/runtime/iplugin.hpp"
#include "openvino/runtime/make_tensor.hpp"
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_engine/comparators/nrmse.hpp"
#include "test_engine/simple_llm_pipeline.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "openvino/util/shared_object.hpp"
#include "openvino/util/file_util.hpp"

#include <filesystem>
#include <algorithm>

#include <gtest/gtest.h>

using namespace testing;
using namespace ov::npuw::tests;
using namespace ov::intel_npu::npuw;

// These tests load the CPU plugin by its library file name, which is not
// possible with a static OpenVINO build; they also require the CPU plugin to
// be part of the build (WITH_CPU_PLUGIN) and an x86 host to run it on.
#ifndef OPENVINO_STATIC_LIBRARY
#ifdef WITH_CPU_PLUGIN
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
// Base library name of the real CPU plugin; OV_BUILD_POSTFIX is appended
// when the plugin is registered (see SetUp below).
const char* cpu_plugin_file_name = "openvino_intel_cpu_plugin";

namespace {
const std::vector<int64_t> What_is_OpenVINO =
{529, 29989, 1792, 29989, 29958, 13, 5618, 338, 4673, 29963, 1177,
29949, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13};
const std::vector<int64_t> OpenVINO =
{6585, 29963, 1177, 29949};
Comment on lines +26 to +31
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this namespace can have a name token_ids.


using LLMTestParams = std::tuple<std::string, ov::AnyMap,
std::vector<int64_t>, std::vector<int64_t>>;
} // anonymous namespace

// Parameterized pipeline test: runs a small LLM through NPUW with the given
// config on the real CPU plugin and compares generated tokens to a reference.
// NOTE(review): the "Accuracy" name may be confused with the real accuracy
// test suites; consider renaming.
class LLMAccuracyTestsNPUW : public ::testing::TestWithParam<LLMTestParams> {
public:
    void SetUp() override {
        // NOTE: TEMPLATE plugin in OpenVINO works for ~20 minutes to generate
        // the first token from the prefill model and crashes on launch of the
        // 3rd subrequest in the generate model.
        // There is no such issue with the CPU plugin, so the CPU plugin was
        // chosen for accuracy checks.
        // Register CPU plugin in OpenVINO:
        try {
            core.register_plugin(std::string(cpu_plugin_file_name) + OV_BUILD_POSTFIX, "CPU");
        } catch (const ov::Exception& ex) {
            if (std::string{ex.what()}.find("Device with \"CPU\" is already registered in the OpenVINO Runtime")
                == std::string::npos) {
                // Rethrow the original exception; `throw ex;` would throw a
                // sliced copy of a possibly-derived exception type.
                throw;
            }
        }

        auto param = GetParam();
        ov::AnyMap config;
        std::tie(model_path, config, input_ids, reference_ids) = param;
        // Run all NPUW submodels on CPU and keep the response short so the
        // test stays fast.
        config["NPUW_DEVICES"] = "CPU";
        config["NPUW_LLM_MIN_RESPONSE_LEN"] = 4;
        simple_llm.initialize(model_path, core, config);
    }

protected:
    ov::Core core;
    SimpleLLMPipeline simple_llm;
    std::string model_path;              // model IR path (test parameter)
    std::vector<int64_t> input_ids;      // prompt token ids (test parameter)
    std::vector<int64_t> actual_ids;     // tokens produced by generate()
    std::vector<int64_t> reference_ids;  // expected tokens (test parameter)
};

// The generated token sequence must match the reference exactly.
TEST_P(LLMAccuracyTestsNPUW, ConfigIsAccurate) {
    actual_ids = simple_llm.generate(input_ids);
    // Compare the whole vectors: the original indexed loop mixed a signed
    // counter with size() and could read past the end of reference_ids when
    // the generation was longer than the reference.
    ASSERT_EQ(actual_ids, reference_ids);
}

// TODO(review): the model path below is a hard-coded local Windows path
// ("C:\apronina\..."); it must be replaced with a path resolved from the
// test data/environment before these tests can run on CI or any other
// machine.
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_FAST_COMPILE, LLMAccuracyTestsNPUW,
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
testing::Values(ov::AnyMap{}),
testing::Values(What_is_OpenVINO),
testing::Values(OpenVINO)));

// Same model/tokens with the BEST_PERF generate hint.
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_BEST_PERF, LLMAccuracyTestsNPUW,
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"}}),
testing::Values(What_is_OpenVINO),
testing::Values(OpenVINO)));

// BEST_PERF generate hint combined with the DYNAMIC prefill hint.
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_DYNAMIC_BEST_PERF, LLMAccuracyTestsNPUW,
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"),
testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"},
{"NPUW_LLM_PREFILL_HINT", "DYNAMIC"}}),
testing::Values(What_is_OpenVINO),
testing::Values(OpenVINO)));

#endif // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64)
#endif // WITH_CPU_PLUGIN
#endif // not OPENVINO_STATIC_LIBRARY
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
// Copyright (C) 2024-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "test_engine/simple_llm_pipeline.hpp"
#include "test_engine/mocks/mock_plugins.hpp"
#include "test_engine/mocks/register_in_ov.hpp"
#include "intel_npu/npuw_private_properties.hpp"
#include "openvino/util/shared_object.hpp"
#include "openvino/util/file_util.hpp"

#include <filesystem>

using namespace testing;
using namespace ov::npuw::tests;
using namespace ov::intel_npu::npuw;

namespace {
// Input token ids for the behavior tests: the chat-formatted
// "What is OpenVINO?" prompt.
const std::vector<int64_t> What_is_OpenVINO = {
    529,   29989, 1792,  29989, 29958, 13,    5618,  338,
    4673,  29963, 1177,  29949, 29973, 2,     29871, 13,
    29966, 29989, 465,   22137, 29989, 29958, 13,
};
}  // anonymous namespace

// Helper macros that make the gmock expectations on the mock plugins below
// read like a small DSL (device/model/request index wrappers + expectation
// setters). Extra variadic arguments are appended as gmock actions, e.g.
// THROW("boom").

// Readability wrappers for numeric macro arguments.
#define TIMES(times) times
#define MODEL(idx) idx
#define INFER_REQ(idx) idx

// Optional action: make the next matching call throw.
#define THROW(...) \
    .WillOnce(Throw(std::runtime_error(__VA_ARGS__)))

// Expect compile_model() on the given device's mock plugin `times` times.
// (The original definition ended with a dangling `\` that spliced the
// following blank line into the macro.)
#define EXPECT_COMPILE_MODEL(device, times, ...)                                \
    EXPECT_CALL(*device##_plugin,                                               \
                compile_model(A<const std::shared_ptr<const ov::Model>&>(), _)) \
        .Times(times)                                                           \
        __VA_ARGS__

// Expect create_sync_infer_request() on compiled model `model_idx`.
#define EXPECT_CREATE_SYNC_INFER_REQ(device, model_idx, times, ...)                            \
    device##_plugin->set_expectations_to_comp_models(model_idx, [](MockCompiledModel& model) { \
        EXPECT_CALL(model, create_sync_infer_request())                                        \
            .Times(times)                                                                      \
            __VA_ARGS__;                                                                       \
    })

// Expect infer() on the first infer request of compiled model `model_idx`.
// Trailing semicolons were dropped from the macro bodies: call sites already
// terminate the invocation with `;`, which produced empty statements.
#define EXPECT_INFER(device, model_idx, times, ...)                                               \
    device##_plugin->set_expectations_to_infer_reqs(model_idx, 0, [](MockInferRequest& request) { \
        EXPECT_CALL(request, infer())                                                             \
            .Times(times)                                                                         \
            __VA_ARGS__;                                                                          \
    })

// Expect infer() on infer request `req_idx` of compiled model `model_idx`.
#define EXPECT_INFER_FOR(device, model_idx, req_idx, times, ...)                                        \
    device##_plugin->set_expectations_to_infer_reqs(model_idx, req_idx, [](MockInferRequest& request) { \
        EXPECT_CALL(request, infer())                                                                   \
            .Times(times)                                                                               \
            __VA_ARGS__;                                                                                \
    })

namespace ov {
namespace npuw {
namespace tests {

// Fixture that routes NPUW's LLM prefill and generate submodels to two
// distinct mock NPU plugins so the number of compile/infer-request/infer
// calls on each of them can be verified independently.
class LLMBehaviorTestsNPUW : public ::testing::Test {
public:
void SetUp() override {
// Create the mock plugin objects and their gmock implementations; they
// are registered in ov::Core later via register_mock_plugins_in_ov().
mock_npu_for_prefill_plugin = std::make_shared<MockNpuPluginForPrefill>();
mock_npu_for_prefill_plugin->create_implementation();
mock_npu_for_generate_plugin = std::make_shared<MockNpuPluginForGenerate>();
mock_npu_for_generate_plugin->create_implementation();
// Route prefill and generate to their own mock devices, and keep the
// response short so the tests run fast.
config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForPrefill"}};
config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForGenerate"}};
config["NPUW_LLM_MIN_RESPONSE_LEN"] = 2;
}

// Registers the mock plugins in ov::Core and keeps their shared objects
// alive for the fixture's lifetime.
// Make sure it is called after expectations are set!
void register_mock_plugins_in_ov() {
m_shared_objects.push_back(reg_plugin<MockNpuPluginForPrefill>(core, mock_npu_for_prefill_plugin));
m_shared_objects.push_back(reg_plugin<MockNpuPluginForGenerate>(core, mock_npu_for_generate_plugin));
}

protected:
ov::Core core;
SimpleLLMPipeline simple_llm;
std::shared_ptr<MockNpuPluginForPrefill> mock_npu_for_prefill_plugin;
std::shared_ptr<MockNpuPluginForGenerate> mock_npu_for_generate_plugin;
ov::AnyMap config;

private:
// Holds the loaded plugin shared objects so they outlive the registered
// plugins.
std::vector<std::shared_ptr<void>> m_shared_objects;
};
} // namespace tests
} // namespace npuw
} // namespace ov

// Default (FAST_COMPILE) path: both prefill and generate models are split
// into head/repeated-function/tail (3 compiles each); the expected request
// and infer counts below encode that structure.
TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_FAST_COMPILE) {
// Set expectations first:
{
InSequence seq;
// Generate model:
EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(3));
// Prefill model:
EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3));
}

// ------------------------ Prefill model ---------------------------
// 1 infer request for head:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1));
// 2 infer requests for function, `create_sync_infer_request()`
// should be called twice here (2 requests to form pipelining):
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2));
// 1 infer request for tail:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1));

// Head's infer request is called once:
EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1));
// There are 20 repeated functions, so:
// Repeated block's 1st infer request is called 10 times:
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10));
// Repeated block's 2nd infer request (brother of 1st one) is called 10 times:
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10));
// Tail's infer request is called once:
EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1));

// ------------------------ Generate model ---------------------------
// 1 infer request for head:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1));
// `create_sync_infer_request()` should be called 20 times
// to create 20 separate infer requests here (due to UNFOLD_IREQS):
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(1), TIMES(20));
// 1 infer request for tail:
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(2), TIMES(1));

// Head's infer request is called once:
EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1));
// Different 20 infer requests of 2nd submodel should be called:
for (int i = 0; i < 20; ++i) {
EXPECT_INFER_FOR(mock_npu_for_generate, MODEL(1), INFER_REQ(i), TIMES(1));
}
// Tail's infer request is called once:
EXPECT_INFER(mock_npu_for_generate, MODEL(2), TIMES(1));

// Register mock objects as plugins in OpenVINO:
register_mock_plugins_in_ov();

// Do the actual test:
// TODO(review): hard-coded local Windows model path — replace with a path
// resolved from the test data/environment before merging.
simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml",
core, config);
EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO));
}

// BEST_PERF generate hint: the generate model is compiled as a single
// monolithic model (1 compile, 1 request, 1 infer) while prefill keeps the
// head/function/tail split with 2-request pipelining.
TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_BEST_PERF) {
    // Set expectations first:
    {
        InSequence seq;
        // Generate model (single compile under BEST_PERF):
        EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(1));
        // Prefill model (head + repeated function + tail):
        EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3));
    }

    // ------------------------ Prefill model ---------------------------
    // 1 infer request for head:
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1));
    // 2 infer requests for function: `create_sync_infer_request()`
    // should be called twice here (2 requests to form pipelining):
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2));
    // 1 infer request for tail:
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1));

    // Head's infer request is called once:
    EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1));
    // There are 20 repeated functions, so:
    // Repeated block's 1st infer request is called 10 times:
    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10));
    // Repeated block's 2nd infer request (brother of 1st one) is called 10 times:
    EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10));
    // Tail's infer request is called once:
    EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1));

    // ------------------------ Generate model ---------------------------
    // 1 infer request for the whole generate model:
    EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1));
    // Infer request is called only once:
    EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1));

    // Register mock objects as plugins in OpenVINO:
    register_mock_plugins_in_ov();

    // Do the actual test:
    config["NPUW_LLM_GENERATE_HINT"] = "BEST_PERF";
    // TODO(review): hard-coded local Windows model path — replace with a path
    // resolved from the test data/environment before merging.
    simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml",
                          core, config);
    EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO));
}

Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ MockPluginBase<DeviceType>::~MockPluginBase() {
}

// Explicit instantiations of the mock-plugin template for each mock device
// tag type used by the tests (declared in mock_plugins.hpp).
template class MockPluginBase<ov::npuw::tests::mocks::Npu>;
template class MockPluginBase<ov::npuw::tests::mocks::NpuForPrefill>;
template class MockPluginBase<ov::npuw::tests::mocks::NpuForGenerate>;
template class MockPluginBase<ov::npuw::tests::mocks::Cpu>;

} // namespace tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,13 +159,23 @@ struct Npu {
static constexpr const char* name = "MockNPU";
static constexpr int num_device_ids = 3;
};
struct NpuForPrefill {
static constexpr const char* name = "MockNPUForPrefill";
static constexpr int num_device_ids = 3;
};
struct NpuForGenerate {
static constexpr const char* name = "MockNPUForGenerate";
static constexpr int num_device_ids = 3;
};
struct Cpu {
static constexpr const char* name = "MockCPU";
static constexpr int num_device_ids = 2;
};
} //namespace mocks

// NiceMock wrappers over the mock plugin template: calls without an explicit
// expectation are silently allowed instead of producing gmock warnings.
using MockNpuPlugin = testing::NiceMock<MockPluginBase<mocks::Npu>>;
using MockNpuPluginForPrefill = testing::NiceMock<MockPluginBase<mocks::NpuForPrefill>>;
using MockNpuPluginForGenerate = testing::NiceMock<MockPluginBase<mocks::NpuForGenerate>>;
using MockCpuPlugin = testing::NiceMock<MockPluginBase<mocks::Cpu>>;

} // namespace tests
Expand Down
Loading
Loading