Skip to content

Commit d383e13

Browse files
committed
vulkan: Split large mul_mat_id to fit in shared memory
1 parent a5d1fb6 commit d383e13

File tree

2 files changed

+38
-4
lines changed

2 files changed

+38
-4
lines changed

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5964,7 +5964,30 @@ static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx
59645964
if (src2->ne[1] == 1 && (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type))) {
59655965
ggml_vk_mul_mat_vec_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
59665966
} else {
5967-
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, src1, src2, dst, dryrun);
5967+
// Split based on number of ids, to fit in shared memory
5968+
const uint32_t nei0 = (uint32_t)src2->ne[0];
5969+
const uint32_t nei1 = (uint32_t)src2->ne[1];
5970+
5971+
GGML_ASSERT(nei0 <= 4096);
5972+
const uint32_t split_size = std::min(nei1, 4096u / nei0);
5973+
5974+
for (uint32_t token_start = 0; token_start < nei1; token_start += split_size) {
5975+
const uint32_t n_tokens = std::min(split_size, nei1 - token_start);
5976+
5977+
ggml_tensor src1_copy = *src1;
5978+
ggml_tensor src2_copy = *src2;
5979+
ggml_tensor dst_copy = *dst;
5980+
5981+
src1_copy.view_offs += token_start * src1_copy.nb[2];
5982+
src2_copy.view_offs += token_start * src2_copy.nb[1];
5983+
dst_copy.view_offs += token_start * dst_copy.nb[2];
5984+
5985+
src1_copy.ne[2] = n_tokens;
5986+
src2_copy.ne[1] = n_tokens;
5987+
dst_copy.ne[2] = n_tokens;
5988+
5989+
ggml_vk_mul_mat_id_q_f16(ctx, subctx, src0, &src1_copy, &src2_copy, &dst_copy, dryrun);
5990+
}
59685991
}
59695992
}
59705993

@@ -10127,9 +10150,15 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
1012710150
ggml_type src0_type = op->src[0]->type;
1012810151
ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
1012910152
const vk_device& device = ggml_vk_get_device(ctx->device);
10130-
if (op->op == GGML_OP_MUL_MAT_ID && !device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
10131-
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
10132-
return false;
10153+
if (op->op == GGML_OP_MUL_MAT_ID) {
10154+
if (!device->mul_mat_id_s[src0_type] && !device->mul_mat_id_m[src0_type] && !device->mul_mat_id_l[src0_type]) {
10155+
// If there's not enough shared memory for row_ids and the result tile, fallback to CPU
10156+
return false;
10157+
}
10158+
// Check against size of shared memory variable
10159+
if (op->src[2]->ne[0] > 4096) {
10160+
return false;
10161+
}
1013310162
}
1013410163
switch (src0_type) {
1013510164
case GGML_TYPE_F32:

tests/test-backend-ops.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4623,6 +4623,11 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
46234623
// this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
46244624
// test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
46254625

4626+
// test large experts*tokens
4627+
for (bool b : {false, true}) {
4628+
test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_F16, GGML_TYPE_F32, 16, 16, b, 32, 1024, 16));
4629+
}
4630+
46264631
for (ggml_type type_a : base_types) {
46274632
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
46284633
for (int n_mats : {4, 8}) {

0 commit comments

Comments
 (0)