pacman100 · pacman100 · Mar 1, 2024 · Mar 1, 2024
diff --git a/llama.cpp b/llama.cpp
@@ -4527,14 +4527,15 @@ static bool llm_load_tensors(
 
  // output
  {
- model.output_norm = ml.create_tensor(ctx_output,  tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
- model.output_norm_b = ml.create_tensor(ctx_output,  tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
- 
- model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT,  "weight"), {n_embd, n_vocab}, false);
+ model.output_norm  = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
  // if output is NULL, init from the input tok embed
  if (model.output == NULL) {
- model.output = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
  ml.n_created--; // artificial tensor
+ ml.size_data += ggml_nbytes(model.output);
  }
 
  }
@@ -4545,7 +4546,7 @@ static bool llm_load_tensors(
 
  auto & layer = model.layers[i];
 
- layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
  layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
  layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
@@ -4554,20 +4555,20 @@ static bool llm_load_tensors(
  layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
- // optional bias tensors
+ // bias tensors (required)
- layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
- layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
- layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
- layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
 
- layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm  = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
  layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
 
  layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
  layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 
- // optional bias tensors
+ // bias tensors (required)
- layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, false);
- layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}, false);
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP ,  "bias", i), {  n_ff});
  }
  } break;
  default:
@@ -7718,7 +7719,7 @@ struct llm_build_context {
  cb(ffn_inp, "ffn_inp", il);
 
  // feed-forward network
- 
+
  cur = llm_build_norm(ctx0, ffn_inp, hparams,
  model.layers[il].ffn_norm, model.layers[il].ffn_norm_b,
  LLM_NORM, cb, il);
@@ -12271,7 +12272,6 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_ORION:
  case LLM_ARCH_INTERNLM2:
  case LLM_ARCH_MINICPM:
- case LLM_ARCH_STARCODER2:
  return LLAMA_ROPE_TYPE_NORM;
 
  // the pairs of head values are offset by n_rot/2
@@ -12284,6 +12284,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
  case LLM_ARCH_QWEN2:
  case LLM_ARCH_PHI2:
  case LLM_ARCH_GEMMA:
+ case LLM_ARCH_STARCODER2:
  return LLAMA_ROPE_TYPE_NEOX;
 
  // all model arches should be listed explicitly here