From e4a16f2493b03500a9936646f4af48472a79f5d7 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 27 Mar 2024 04:22:09 +0000 Subject: [PATCH] llama.cpp: Include the changes from #6122 to exclude the unused outputs of the last layer. --- llama.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llama.cpp b/llama.cpp index 91432a16348d5..a8f675bdee811 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6525,6 +6525,13 @@ struct llm_build_context { cb(cur, "kqv_out", il); } + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); cb(ffn_inp, "ffn_inp", il);