diff --git a/llama.cpp b/llama.cpp index 91432a16348d5..a8f675bdee811 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6525,6 +6525,13 @@ struct llm_build_context { cb(cur, "kqv_out", il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il);