server : fix kv cache management (#3588)
ggerganov committed Oct 12, 2023
1 parent b8fe4b5 · commit 57dd55e
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions examples/server/server.cpp
@@ -405,13 +405,17 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
+
         if (n_past == num_prompt_tokens)
         {
             // we have to evaluate at least 1 token to generate logits.
             printf("we have to evaluate at least 1 token to generate logits\n");
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
@@ -461,16 +465,16 @@ struct llama_server_context
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
 
-        // since #3228 we now have to manually manage the KV cache
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
-
         embd = prompt_tokens;
         if (n_past == num_prompt_tokens)
        {
             // we have to evaluate at least 1 token to generate logits.
             n_past--;
         }
 
+        // since #3228 we now have to manually manage the KV cache
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+
         LOG_VERBOSE("prompt ingested", {
                                            {"n_past", n_past},
                                            {"cached", tokens_to_str(ctx, embd.cbegin(), embd.cbegin() + n_past)},
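
The substance of the change: since #3228, the server must evict stale KV cache cells itself, and the eviction has to happen after the "evaluate at least 1 token" adjustment to n_past, not before. Below is a minimal, self-contained sketch of that ordering. A plain std::vector stands in for the per-sequence KV cache, common_part mirrors the server's helper of the same name, and the vector truncation plays the role of llama_kv_cache_seq_rm(ctx, 0, n_past, -1); everything else (token values, main) is illustrative, not llama.cpp API.

#include <cstddef>
#include <cstdio>
#include <vector>

using llama_token = int; // stand-in for llama.cpp's token type

// longest common prefix of the cached tokens and the new prompt,
// mirroring the server's common_part() helper
static size_t common_part(const std::vector<llama_token> & a,
                          const std::vector<llama_token> & b) {
    size_t i = 0;
    while (i < a.size() && i < b.size() && a[i] == b[i]) {
        i++;
    }
    return i;
}

int main() {
    std::vector<llama_token> embd          = {10, 20, 30, 40}; // previously evaluated prompt
    std::vector<llama_token> prompt_tokens = {10, 20, 30, 40}; // same prompt resubmitted

    size_t n_past = common_part(embd, prompt_tokens); // 4: full prefix is reusable
    embd = prompt_tokens;

    if (n_past == prompt_tokens.size()) {
        // we have to evaluate at least 1 token to generate logits,
        // so step back and re-evaluate the last prompt token
        n_past--;
    }

    // truncate only now, after the adjustment: this models
    // llama_kv_cache_seq_rm(ctx, 0, n_past, -1), which drops cells
    // [n_past, inf) for sequence 0. Truncating before the adjustment
    // (the old order in the second hunk) would leave a stale cell for
    // the token that is about to be re-evaluated.
    std::vector<llama_token> kv_cells(embd.begin(), embd.begin() + n_past);

    printf("n_past = %zu, cells kept = %zu, tokens to evaluate = %zu\n",
           n_past, kv_cells.size(), prompt_tokens.size() - n_past);
    return 0;
}

Run against an identical resubmitted prompt, the sketch keeps 3 of 4 cells and re-evaluates 1 token, which matches the state the moved llama_kv_cache_seq_rm call arranges in the server.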
