ggerganov · ggerganov · Mar 26, 2024 · Mar 15, 2024 · Mar 15, 2024 · Mar 16, 2024
diff --git a/README.md b/README.md
@@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Recent API changes
 
+- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
@@ -633,6 +634,15 @@ Building the program with BLAS support may lead to some performance improvements
 
 - #### Vulkan
 
+> [!WARNING]
+>
+> Vulkan support was broken by https://github.com/ggerganov/llama.cpp/pull/6122,
+> which relies on `GGML_OP_GET_ROWS`, an op that is not yet properly supported by the Vulkan backend.
+> A fix is expected relatively soon, possibly in https://github.com/ggerganov/llama.cpp/pull/6155
+> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635).
+>
+> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
+
  **With docker**:
 
  You don't need to install Vulkan SDK. It will be installed inside the container.

diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
@@ -424,6 +424,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
  tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
  }
 
+ // TODO: use batch.logits to save computations instead of relying on logits_all == true
  if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
  fprintf(stderr, "%s : failed to eval\n", __func__);
  return false;

diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
@@ -132,7 +132,6 @@ int main(int argc, char ** argv) {
  llama_context * ctx = NULL;
 
  // load the target model
- params.logits_all = true;
  std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
  // load the prompts from an external file if there are any

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
@@ -746,7 +746,8 @@ struct server_context {
  {
  const int32_t n_batch = llama_n_batch(ctx);
 
- batch = llama_batch_init(n_batch, 0, params.n_parallel);
+ // only a single seq_id per token is needed
+ batch = llama_batch_init(n_batch, 0, 1);
  }
 
  metrics.init();

diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
@@ -65,7 +65,6 @@ int main(int argc, char ** argv) {
  llama_context * ctx_dft = NULL;
 
  // load the target model
- params.logits_all = true;
  std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
 
  // load the draft model

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -11370,7 +11370,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
 
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
 

diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp
@@ -1430,6 +1430,10 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
  struct ggml_tensor * dst = gf->nodes[i];
  GGML_ASSERT(dst->data != nullptr);
 
+ if (ggml_is_empty(dst)) {
+ continue;
+ }
+
  switch (dst->op) {
  case GGML_OP_NONE:
  case GGML_OP_RESHAPE:

diff --git a/ggml-metal.m b/ggml-metal.m
@@ -837,6 +837,10 @@ static enum ggml_status ggml_metal_graph_compute(
  struct ggml_tensor * src2 = gf->nodes[i]->src[2];
  struct ggml_tensor * dst = gf->nodes[i];
 
+ if (ggml_is_empty(dst)) {
+ continue;
+ }
+
  switch (dst->op) {
  case GGML_OP_NONE:
  case GGML_OP_RESHAPE:

diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
@@ -2234,6 +2234,11 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg
 static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
  for (int i = 0; i < graph->n_nodes; ++i) {
  ggml_tensor * node = graph->nodes[i];
+
+ if (ggml_is_empty(node)) {
+ continue;
+ }
+
  switch (node->op) {
  case GGML_OP_MUL_MAT:
  ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
@@ -16973,7 +16973,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back
  params.ith = 0;
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
 #ifndef NDEBUG

diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
@@ -5566,7 +5566,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
  for (int i = 0; i < cgraph->n_nodes; i++) {
  ggml_tensor * node = cgraph->nodes[i];
 
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
+ if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
  continue;
  }
 

diff --git a/ggml.c b/ggml.c
@@ -2594,6 +2594,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
  tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) {
+    // a tensor holds no elements as soon as any one of its dimensions has size 0,
+    // so stop scanning at the first zero-sized dimension
+    bool has_zero_dim = false;
+    for (int dim = 0; dim < GGML_MAX_DIMS && !has_zero_dim; ++dim) {
+        has_zero_dim = (tensor->ne[dim] == 0);
+    }
+    return has_zero_dim;
+}
+
 bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -2608,7 +2618,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
  static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
- return
+ return ggml_is_empty(t0) ? ggml_is_empty(t1) :
  (t1->ne[0]%t0->ne[0] == 0) &&
  (t1->ne[1]%t0->ne[1] == 0) &&
  (t1->ne[2]%t0->ne[2] == 0) &&
@@ -16093,7 +16103,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
  GGML_ASSERT(params);
 
- if (tensor->op == GGML_OP_NONE) {
+ if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
  return;
  }
 
@@ -17962,6 +17972,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
  int n_tasks = 0;
 
+ if (ggml_is_empty(node)) {
+ // no need to multi-thread a no-op
+ n_tasks = 1;
+ return n_tasks;
+ }
+
  switch (node->op) {
  case GGML_OP_CPY:
  case GGML_OP_DUP:

diff --git a/ggml.h b/ggml.h
@@ -748,6 +748,7 @@ extern "C" {
  GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
  GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
  GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
+ GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
  GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);