-
Notifications
You must be signed in to change notification settings - Fork 2.7k
LLM NPUW tests for PR checks #30051
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
LLM NPUW tests for PR checks #30051
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
// Copyright (C) 2024-2025 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "test_engine/comparators/nrmse.hpp" | ||
#include "test_engine/simple_llm_pipeline.hpp" | ||
#include "intel_npu/npuw_private_properties.hpp" | ||
#include "openvino/util/shared_object.hpp" | ||
#include "openvino/util/file_util.hpp" | ||
|
||
#include <filesystem> | ||
#include <algorithm> | ||
|
||
#include <gtest/gtest.h> | ||
|
||
using namespace testing; | ||
using namespace ov::npuw::tests; | ||
using namespace ov::intel_npu::npuw; | ||
|
||
// this tests load plugin by library name: this is not available during static linkage | ||
#ifndef OPENVINO_STATIC_LIBRARY | ||
#ifdef WITH_CPU_PLUGIN | ||
#if defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) | ||
// Base name of the CPU plugin library; SetUp() appends OV_BUILD_POSTFIX to it and registers the result as the "CPU" device. | ||
const char* cpu_plugin_file_name = "openvino_intel_cpu_plugin"; | ||
|
||
namespace { | ||
// Input token ids passed to the pipeline in every instantiation below | ||
// (presumably the tokenized chat prompt "What is OpenVINO?" - TODO confirm against the tokenizer). | ||
const std::vector<int64_t> What_is_OpenVINO = | ||
{529, 29989, 1792, 29989, 29958, 13, 5618, 338, 4673, 29963, 1177, | ||
29949, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13}; | ||
// Reference token ids the generated ids are compared against in every instantiation below. | ||
const std::vector<int64_t> OpenVINO = | ||
{6585, 29963, 1177, 29949}; | ||
|
||
// Test parameters, in the order SetUp() unpacks them: | ||
// model path, extra config, input token ids, reference token ids. | ||
using LLMTestParams = std::tuple<std::string, ov::AnyMap, | ||
std::vector<int64_t>, std::vector<int64_t>>; | ||
} // anonymous namespace | ||
|
||
// Parameterized fixture: registers the CPU plugin, unpacks the test parameters | ||
// (model path, config, input ids, reference ids) and initializes the LLM pipeline | ||
// with NPUW_DEVICES forced to "CPU". | ||
class LLMAccuracyTestsNPUW : public ::testing::TestWithParam<LLMTestParams> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This shouldn't be called an ACCURACY test to avoid confusion with the real accuracy tests. |
||
public: | ||
    void SetUp() override { | ||
        // NOTE: TEMPLATE plugin in OpenVINO works for ~20 minutes to generate | ||
        //       first token from prefill model and crashes on launch of | ||
        //       3rd subrequest in generate model. | ||
        //       There is no such issue with CPU plugin, so CPU plugin was chosen | ||
        //       for accuracy checks. | ||
        // Register CPU plugin in OpenVINO: | ||
        try { | ||
            core.register_plugin(std::string(cpu_plugin_file_name) + OV_BUILD_POSTFIX, "CPU"); | ||
        } catch (const ov::Exception& ex) { | ||
            // Only the "already registered" error is tolerated; anything else is propagated. | ||
            if (std::string{ex.what()}.find("Device with \"CPU\" is already registered in the OpenVINO Runtime") | ||
                == std::string::npos) { | ||
                // Bare `throw;` re-throws the original exception object; | ||
                // `throw ex;` would throw a copy and slice any derived exception type. | ||
                throw; | ||
            } | ||
        } | ||
|
||
        auto param = GetParam(); | ||
        ov::AnyMap config; | ||
        std::tie(model_path, config, input_ids, reference_ids) = param; | ||
        // These two entries override whatever the parameterized config carried for the same keys. | ||
        config["NPUW_DEVICES"] = "CPU"; | ||
        config["NPUW_LLM_MIN_RESPONSE_LEN"] = 4; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the prompt length? Not sure why you limit this one here, but if your test prompt is short you may want to limit the PROMPT_LEN as well to improve the prefill's performance on CPU. |
||
        simple_llm.initialize(model_path, core, config); | ||
    } | ||
|
||
protected: | ||
    ov::Core core; | ||
    SimpleLLMPipeline simple_llm; | ||
    std::string model_path; | ||
    // Prompt token ids fed to the pipeline. | ||
    std::vector<int64_t> input_ids; | ||
    // Token ids produced by the pipeline; filled in by the test body. | ||
    std::vector<int64_t> actual_ids; | ||
    // Expected token ids taken from the test parameters. | ||
    std::vector<int64_t> reference_ids; | ||
}; | ||
|
||
// Generates token ids for the parameterized prompt and checks them, element by | ||
// element, against the reference ids. | ||
TEST_P(LLMAccuracyTestsNPUW, ConfigIsAccurate) { | ||
    actual_ids = simple_llm.generate(input_ids); | ||
    // An empty result would make the comparison loop below pass vacuously. | ||
    ASSERT_FALSE(actual_ids.empty()); | ||
    // Guard the indexing below: every generated token needs a reference counterpart, | ||
    // otherwise reference_ids[i] would be read out of bounds. | ||
    ASSERT_LE(actual_ids.size(), reference_ids.size()); | ||
    for (std::size_t i = 0; i < actual_ids.size(); ++i) { | ||
        ASSERT_EQ(actual_ids[i], reference_ids[i]) << "token mismatch at position " << i; | ||
    } | ||
} | ||
|
||
// Instantiation with an empty extra config: only the keys forced in SetUp() are set. | ||
// NOTE(review): the model path is an absolute path on a developer machine; | ||
// it has to become configurable before this can run in PR checks. | ||
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_FAST_COMPILE, LLMAccuracyTestsNPUW, | ||
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"), | ||
testing::Values(ov::AnyMap{}), | ||
testing::Values(What_is_OpenVINO), | ||
testing::Values(OpenVINO))); | ||
|
||
// Instantiation with NPUW_LLM_GENERATE_HINT set to "BEST_PERF". | ||
// NOTE(review): the model path is an absolute path on a developer machine; | ||
// it has to become configurable before this can run in PR checks. | ||
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_BEST_PERF, LLMAccuracyTestsNPUW, | ||
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"), | ||
testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"}}), | ||
testing::Values(What_is_OpenVINO), | ||
testing::Values(OpenVINO))); | ||
|
||
// Instantiation with NPUW_LLM_GENERATE_HINT set to "BEST_PERF" and NPUW_LLM_PREFILL_HINT set to "DYNAMIC". | ||
// NOTE(review): the model path is an absolute path on a developer machine; | ||
// it has to become configurable before this can run in PR checks. | ||
INSTANTIATE_TEST_SUITE_P(LLMAccuracyNPUW_DYNAMIC_BEST_PERF, LLMAccuracyTestsNPUW, | ||
::testing::Combine(testing::Values("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml"), | ||
testing::Values(ov::AnyMap{{"NPUW_LLM_GENERATE_HINT", "BEST_PERF"}, | ||
{"NPUW_LLM_PREFILL_HINT", "DYNAMIC"}}), | ||
testing::Values(What_is_OpenVINO), | ||
testing::Values(OpenVINO))); | ||
|
||
#endif // defined(OPENVINO_ARCH_X86) || defined(OPENVINO_ARCH_X86_64) | ||
#endif // WITH_CPU_PLUGIN | ||
#endif // not OPENVINO_STATIC_LIBRARY |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,197 @@ | ||
// Copyright (C) 2024-2025 Intel Corporation | ||
// SPDX-License-Identifier: Apache-2.0 | ||
// | ||
|
||
#include "test_engine/simple_llm_pipeline.hpp" | ||
#include "test_engine/mocks/mock_plugins.hpp" | ||
#include "test_engine/mocks/register_in_ov.hpp" | ||
#include "intel_npu/npuw_private_properties.hpp" | ||
#include "openvino/util/shared_object.hpp" | ||
#include "openvino/util/file_util.hpp" | ||
|
||
#include <filesystem> | ||
|
||
using namespace testing; | ||
using namespace ov::npuw::tests; | ||
using namespace ov::intel_npu::npuw; | ||
|
||
namespace { | ||
// Input token ids passed to simple_llm.generate() by the behavior tests in this file | ||
// (presumably the tokenized chat prompt "What is OpenVINO?" - TODO confirm against the tokenizer). | ||
const std::vector<int64_t> What_is_OpenVINO = | ||
{529, 29989, 1792, 29989, 29958, 13, 5618, 338, 4673, 29963, 1177, | ||
29949, 29973, 2, 29871, 13, 29966, 29989, 465, 22137, 29989, 29958, 13}; | ||
} // anonymous namespace | ||
|
||
// Readability label: expands to its argument unchanged. Marks the call-count argument of the EXPECT_* macros. | ||
#define TIMES(times) times | ||
|
||
// Expands to a gMock `.WillOnce(Throw(std::runtime_error(<args>)))` clause. | ||
// Presumably meant to be passed as the trailing variadic argument of the EXPECT_* macros - not used in the visible tests. | ||
#define THROW(...) \ | ||
.WillOnce(Throw(std::runtime_error(__VA_ARGS__))) | ||
|
||
// Sets a gMock expectation that `<device>_plugin->compile_model(<model>, <anything>)` is called `times` times; | ||
// optional extra clauses are appended through the variadic arguments. | ||
// NOTE(review): the last line ends with a line continuation, so the macro body absorbs the next source line; | ||
// that line must stay blank (or the trailing backslash should be dropped). | ||
#define EXPECT_COMPILE_MODEL(device, times, ...) \ | ||
EXPECT_CALL(*device##_plugin, \ | ||
compile_model(A<const std::shared_ptr<const ov::Model>&>(), _)) \ | ||
.Times(times) \ | ||
__VA_ARGS__ \ | ||
|
||
// Readability label: expands to its argument unchanged. Marks the compiled-model index argument of the EXPECT_* macros. | ||
#define MODEL(idx) idx | ||
|
||
// Readability label: expands to its argument unchanged. Marks the infer-request index argument of EXPECT_INFER_FOR. | ||
#define INFER_REQ(idx) idx | ||
|
||
// Passes to `<device>_plugin->set_expectations_to_comp_models` a callback for `model_idx` that expects | ||
// `create_sync_infer_request()` to be called `times` times on the mock compiled model; | ||
// optional extra gMock clauses are appended through the variadic arguments. | ||
// The expansion has no trailing ';' - the caller supplies it. | ||
#define EXPECT_CREATE_SYNC_INFER_REQ(device, model_idx, times, ...) \ | ||
device##_plugin->set_expectations_to_comp_models(model_idx, [](MockCompiledModel& model) { \ | ||
EXPECT_CALL(model, create_sync_infer_request()) \ | ||
.Times(times) \ | ||
__VA_ARGS__; \ | ||
}) | ||
|
||
// Passes to `<device>_plugin->set_expectations_to_infer_reqs` a callback for (`model_idx`, request index 0) | ||
// that expects `infer()` to be called `times` times on the mock infer request; | ||
// optional extra gMock clauses are appended through the variadic arguments. | ||
// NOTE(review): unlike EXPECT_CREATE_SYNC_INFER_REQ, the expansion already ends with ';', | ||
// so `EXPECT_INFER(...);` yields an extra empty statement. | ||
#define EXPECT_INFER(device, model_idx, times, ...) \ | ||
device##_plugin->set_expectations_to_infer_reqs(model_idx, 0, [](MockInferRequest& request) { \ | ||
EXPECT_CALL(request, infer()) \ | ||
.Times(times) \ | ||
__VA_ARGS__; \ | ||
}); | ||
|
||
// Same as EXPECT_INFER, but for an explicit infer-request index `req_idx` instead of 0: | ||
// expects `infer()` to be called `times` times on that mock infer request of `model_idx`. | ||
// `req_idx` is evaluated outside the lambda, so it may be a runtime value such as a loop counter. | ||
// NOTE(review): the expansion already ends with ';'. | ||
#define EXPECT_INFER_FOR(device, model_idx, req_idx, times, ...) \ | ||
device##_plugin->set_expectations_to_infer_reqs(model_idx, req_idx, [](MockInferRequest& request) { \ | ||
EXPECT_CALL(request, infer()) \ | ||
.Times(times) \ | ||
__VA_ARGS__; \ | ||
}); | ||
|
||
namespace ov { | ||
namespace npuw { | ||
namespace tests { | ||
|
||
// Fixture for the behavior tests: owns two mock NPU plugins (one for the prefill | ||
// stage, one for the generate stage) and the config that points each stage at its mock. | ||
class LLMBehaviorTestsNPUW : public ::testing::Test { | ||
public: | ||
    // Creates both mock plugins and fills in the base pipeline config. | ||
    void SetUp() override { | ||
        auto prefill_mock = std::make_shared<MockNpuPluginForPrefill>(); | ||
        prefill_mock->create_implementation(); | ||
        mock_npu_for_prefill_plugin = prefill_mock; | ||
|
||
        auto generate_mock = std::make_shared<MockNpuPluginForGenerate>(); | ||
        generate_mock->create_implementation(); | ||
        mock_npu_for_generate_plugin = generate_mock; | ||
|
||
        // Each stage gets its own NPUW_DEVICES entry naming the matching mock device. | ||
        config["++NPUW_LLM_PREFILL_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForPrefill"}}; | ||
        config["++NPUW_LLM_GENERATE_CONFIG"] = ov::AnyMap{{"NPUW_DEVICES", "MockNPUForGenerate"}}; | ||
        config["NPUW_LLM_MIN_RESPONSE_LEN"] = 2; | ||
    } | ||
|
||
    // Registers both mocks in `core` and keeps the handles returned by reg_plugin. | ||
    // Must be called only after all expectations have been set on the mocks! | ||
    void register_mock_plugins_in_ov() { | ||
        const auto prefill_handle = reg_plugin<MockNpuPluginForPrefill>(core, mock_npu_for_prefill_plugin); | ||
        m_shared_objects.push_back(prefill_handle); | ||
        const auto generate_handle = reg_plugin<MockNpuPluginForGenerate>(core, mock_npu_for_generate_plugin); | ||
        m_shared_objects.push_back(generate_handle); | ||
    } | ||
|
||
protected: | ||
    ov::Core core; | ||
    SimpleLLMPipeline simple_llm; | ||
    std::shared_ptr<MockNpuPluginForPrefill> mock_npu_for_prefill_plugin; | ||
    std::shared_ptr<MockNpuPluginForGenerate> mock_npu_for_generate_plugin; | ||
    ov::AnyMap config; | ||
|
||
private: | ||
    // Handles returned by reg_plugin, held for the lifetime of the fixture. | ||
    std::vector<std::shared_ptr<void>> m_shared_objects; | ||
}; | ||
} // namespace tests | ||
} // namespace npuw | ||
} // namespace ov | ||
|
||
// Behavior test with the fixture's base config only (no generate hint is set here; | ||
// presumably FAST_COMPILE is the default - TODO confirm). | ||
// Checks how many submodels each mock plugin compiles, how many infer requests are created | ||
// per submodel and how often each one is run during a single generate() call. | ||
TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_FAST_COMPILE) { | ||
// Set expectations first: | ||
{ | ||
// InSequence enforces the order: generate-model compilations come before prefill-model ones. | ||
InSequence seq; | ||
// Generate model: | ||
EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(3)); | ||
// Prefill model: | ||
EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3)); | ||
} | ||
|
||
// ------------------------ Prefill model --------------------------- | ||
// 1 infer request for head: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1)); | ||
// 2 infer requests for function, `create_sync_infer_request()` | ||
// should be called twice here (2 requests to form pipelining): | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2)); | ||
// 1 infer request for tail: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1)); | ||
|
||
// Head's infer request is called once: | ||
EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1)); | ||
// There are 20 repeated functions, so: | ||
// Repeated block's 1st infer request is called 10 times: | ||
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10)); | ||
// Repeated block's 2nd infer request (brother of 1st one) is called 10 times: | ||
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10)); | ||
// Tail's infer request is called once: | ||
EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1)); | ||
|
||
// ------------------------ Generate model --------------------------- | ||
// 1 infer request for head: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1)); | ||
// `create_sync_infer_request()` should be called 20 times | ||
// to create 20 separate infer requests here (due to UNFOLD_IREQS): | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(1), TIMES(20)); | ||
// 1 infer request for tail: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(2), TIMES(1)); | ||
|
||
// Head's infer request is called once: | ||
EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1)); | ||
// Different 20 infer requests of 2nd submodel should be called: | ||
for (int i = 0; i < 20; ++i) { | ||
EXPECT_INFER_FOR(mock_npu_for_generate, MODEL(1), INFER_REQ(i), TIMES(1)); | ||
} | ||
// Tail's infer request is called once: | ||
EXPECT_INFER(mock_npu_for_generate, MODEL(2), TIMES(1)); | ||
|
||
// Register mock objects as plugins in OpenVINO: | ||
register_mock_plugins_in_ov(); | ||
|
||
// Do the actual test: | ||
// NOTE(review): the model path is an absolute path on a developer machine; | ||
// it has to become configurable before this can run in PR checks. | ||
// Only generate() is wrapped in EXPECT_NO_THROW; an exception from initialize() propagates out of the test body. | ||
simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml", | ||
core, config); | ||
EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO)); | ||
} | ||
|
||
// Behavior test with NPUW_LLM_GENERATE_HINT set to "BEST_PERF": the generate mock plugin is expected | ||
// to compile a single model with one infer request run once, while the prefill expectations | ||
// (3 compiled submodels, pipelined repeated block) are the same as in the FAST_COMPILE test. | ||
TEST_F(LLMBehaviorTestsNPUW, LLMBehaviorNPUW_BEST_PERF) { | ||
// Set expectations first: | ||
{ | ||
// InSequence enforces the order: the generate-model compilation comes before prefill-model ones. | ||
InSequence seq; | ||
// Generate model: | ||
EXPECT_COMPILE_MODEL(mock_npu_for_generate, TIMES(1)); | ||
// Prefill model: | ||
EXPECT_COMPILE_MODEL(mock_npu_for_prefill, TIMES(3)); | ||
} | ||
|
||
// ------------------------ Prefill model --------------------------- | ||
// 1 infer request for head: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(0), TIMES(1)); | ||
// 2 infer requests for function, `create_sync_infer_request()` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to polish |
||
// should be called twice here (2 requests to form pipelining): | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(1), TIMES(2)); | ||
// 1 infer request for tail: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_prefill, MODEL(2), TIMES(1)); | ||
|
||
// Head's infer request is called once: | ||
EXPECT_INFER(mock_npu_for_prefill, MODEL(0), TIMES(1)); | ||
// There are 20 repeated functions, so: | ||
// Repeated block's 1st infer request is called 10 times: | ||
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(0), TIMES(10)); | ||
// Repeated block's 2nd infer request (brother of 1st one) is called 10 times: | ||
EXPECT_INFER_FOR(mock_npu_for_prefill, MODEL(1), INFER_REQ(1), TIMES(10)); | ||
// Tail's infer request is called once: | ||
EXPECT_INFER(mock_npu_for_prefill, MODEL(2), TIMES(1)); | ||
|
||
// ------------------------ Generate model --------------------------- | ||
// 1 infer request for the whole generate model: | ||
EXPECT_CREATE_SYNC_INFER_REQ(mock_npu_for_generate, MODEL(0), TIMES(1)); | ||
// Infer request is called only once: | ||
EXPECT_INFER(mock_npu_for_generate, MODEL(0), TIMES(1)); | ||
|
||
// Register mock objects as plugins in OpenVINO: | ||
register_mock_plugins_in_ov(); | ||
|
||
// Do the actual test: | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. to remove |
||
// The hint is added to the fixture config just before the pipeline is initialized. | ||
config["NPUW_LLM_GENERATE_HINT"] = "BEST_PERF"; | ||
// NOTE(review): the model path is an absolute path on a developer machine; | ||
// it has to become configurable before this can run in PR checks. | ||
simple_llm.initialize("C:\\apronina\\models\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\TinyLlama-1.1B-Chat-v1.0_int4_sym_group128_dyn_stateful\\openvino_model.xml", | ||
core, config); | ||
EXPECT_NO_THROW(simple_llm.generate(What_is_OpenVINO)); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This namespace can have a name: `token_ids`.