llama : greatly reduce output buffer memory usage #6122

Merged
merged 26 commits into master from compilade/smaller-output-buffer on Mar 26, 2024
Changes from 9 commits
Commits (26)
1fd1918
llama : greatly reduce logits memory usage
compilade Mar 15, 2024
98914c0
llama : more compact state saving and reloading
compilade Mar 15, 2024
705d393
llama : fix lctx.n_outputs not being set before building graph
compilade Mar 16, 2024
25981fc
perplexity : adapt to the logits API changes
compilade Mar 17, 2024
17b45c9
perplexity : fix Winogrande, use correct logits for second choice start
compilade Mar 17, 2024
d0129e8
perplexity : normalize spaces and punctuation in Winogrande sentences
compilade Mar 17, 2024
487f89e
llama : fix embedding conditions
compilade Mar 17, 2024
408fcb0
llama : fix llama_get_embeddings_ith when the resulting id is 0
compilade Mar 17, 2024
e19cb3a
llama : fix wrong n_outputs in llama_set_inputs
compilade Mar 17, 2024
a57fa7f
llama : fix not-skipping outputs of non-causal models
compilade Mar 18, 2024
711b0bc
llama : fix running a batch with n_outputs == 0
compilade Mar 18, 2024
d100502
llama : keep same graph topology even when n_outputs == 0
compilade Mar 18, 2024
99c37cc
ggml : saner ggml_can_repeat with empty tensors
compilade Mar 18, 2024
6bf7f3f
ggml : do not multi-thread ops returning empty tensors
compilade Mar 18, 2024
09bb15a
ggml : make ggml_is_empty public and work with views
compilade Mar 19, 2024
4551e7e
llama : use a vector for ctx->output_ids
compilade Mar 19, 2024
8b826c5
ggml : skip empty tensors in all backends
compilade Mar 19, 2024
d04cfaf
llama : fix llama_output_reserve nullptr deref when new_size is 0
compilade Mar 19, 2024
8f70dcb
perplexity : make Winogrande work as it does on master
compilade Mar 19, 2024
615a3a4
llama : clearer error messages for invalid logits or embeddings ids
compilade Mar 19, 2024
7d8d6b5
llama : handle errors from llama_output_reserve at call sites
compilade Mar 21, 2024
5f33a67
perplexity : make hellaswag and multiple-choice outputs identical to …
compilade Mar 21, 2024
ffa9abd
Merge branch 'master' into compilade/smaller-output-buffer
compilade Mar 25, 2024
e9095ac
llama : allow loading state saved with a different ctx size
compilade Mar 26, 2024
5027d81
llama : minor
ggerganov Mar 26, 2024
20248e8
readme : update recent API changes, and warn about Vulkan
compilade Mar 26, 2024
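Taken together, these commits size the output buffer by the number of tokens that actually request an output (via the per-token batch.logits flags) instead of by the full context, with ctx->output_ids mapping batch positions to rows of the compacted buffer. The following is a minimal caller-side sketch of that pattern, assuming the llama.h API of this period (llama_batch_init, llama_decode, llama_get_logits_ith); it is an illustration, not code taken from this PR.

```cpp
// Sketch only (not from this PR): decode a prompt and keep logits for just the
// final token, so the context reserves a single output row instead of one per token.
#include "llama.h"

static bool decode_prompt_last_logits(llama_context * ctx, const llama_token * tokens, int32_t n_tokens) {
    llama_batch batch = llama_batch_init(n_tokens, 0, 1); // 1 seq_id per token is enough here

    for (int32_t i = 0; i < n_tokens; ++i) {
        batch.token[i]     = tokens[i];
        batch.pos[i]       = i;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = i == n_tokens - 1; // request an output only for the last token
    }
    batch.n_tokens = n_tokens;

    const bool ok = llama_decode(ctx, batch) == 0;
    if (ok) {
        // the index is the position in the batch; the context maps it to the
        // single row that was actually stored
        const float * logits = llama_get_logits_ith(ctx, n_tokens - 1);
        (void) logits; // consume the logits here
    }

    llama_batch_free(batch);
    return ok;
}
```

Roughly speaking, the old buffer was reserved for every context position up front, while after this change only the flagged token above occupies a row.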
1 change: 1 addition & 0 deletions examples/imatrix/imatrix.cpp
@@ -403,6 +403,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
 tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
 }

+// TODO: use batch.logits to save computations instead of relying on logits_all == true
 if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
 fprintf(stderr, "%s : failed to eval\n", __func__);
 return false;
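The new TODO points at the same mechanism: llama_batch_get_one leaves batch.logits unset, so this call still relies on logits_all == true and keeps an output row for every token in the batch, where a manually filled batch with per-token logits flags (as in the sketch above) could skip the rows imatrix never reads.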
1 change: 0 additions & 1 deletion examples/parallel/parallel.cpp
@@ -132,7 +132,6 @@ int main(int argc, char ** argv) {
 llama_context * ctx = NULL;

 // load the target model
-params.logits_all = true;
 std::tie(model, ctx) = llama_init_from_gpt_params(params);

 // load the prompts from an external file if there are any
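Dropping params.logits_all here should be safe because parallel.cpp builds its batches with the common.h helpers and already flags the tokens whose logits it samples from. A short sketch of that helper-based pattern (illustrative, with hypothetical names for the prompt and sequence variables, not the PR's code):

```cpp
// Illustrative sketch: request logits only for the last token of a sequence
// using the llama_batch_add helper from common.h.
#include "common.h"
#include "llama.h"

#include <vector>

static void add_prompt_to_batch(llama_batch & batch,
                                const std::vector<llama_token> & prompt,
                                llama_seq_id seq_id) {
    for (size_t i = 0; i < prompt.size(); ++i) {
        const bool need_logits = i == prompt.size() - 1; // only the last token is sampled from
        llama_batch_add(batch, prompt[i], (llama_pos) i, { seq_id }, need_logits);
    }
}
```

With logits_all gone, only the flagged tokens take up rows in the output buffer.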
189 changes: 118 additions & 71 deletions examples/perplexity/perplexity.cpp

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion examples/server/server.cpp
@@ -744,7 +744,8 @@ struct server_context {
 {
 const int32_t n_batch = llama_n_batch(ctx);

-batch = llama_batch_init(n_batch, 0, params.n_parallel);
+// only a single seq_id per token is needed
+batch = llama_batch_init(n_batch, 0, 1);
 }

 metrics.init();
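The third argument of llama_batch_init is the maximum number of seq_ids that can be attached to a single token, not the number of parallel slots, so passing params.n_parallel only enlarged the per-token seq_id arrays. A small sketch of the cheaper call (the batch size is illustrative, not taken from the server code):

```cpp
#include "llama.h"

// Sketch: n_seq_max sizes the per-token seq_id arrays, so 1 is enough when
// every token belongs to exactly one sequence.
int main() {
    const int32_t n_batch = 2048; // illustrative batch size

    llama_batch batch = llama_batch_init(n_batch, /* embd = */ 0, /* n_seq_max = */ 1);
    // ... fill the batch, giving each token a single seq_id, then llama_decode ...
    llama_batch_free(batch);

    return 0;
}
```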
1 change: 0 additions & 1 deletion examples/speculative/speculative.cpp
@@ -65,7 +65,6 @@ int main(int argc, char ** argv) {
 llama_context * ctx_dft = NULL;

 // load the target model
-params.logits_all = true;
 std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);

 // load the draft model