sync : llama.cpp #764

Merged
merged 21 commits on Mar 14, 2024

21 commits
51426b9  llama : support Mamba Selective State Space Models (llama/5328)  (compilade, Mar 8, 2024)
6fca2e3  ggml : add ggml-common.h to deduplicate shared code (llama/5940)  (ggerganov, Mar 9, 2024)
87755a9  ggml : remove old quantization functions (llama/5942)  (ggerganov, Mar 9, 2024)
64927b1  ggml : fix unnecessary f32 -> f16 -> f32 casts (mmla) (llama/5951)  (ggerganov, Mar 9, 2024)
9a340d0  ggml : remove __constant__ specifier for CUDA tables (llama/5940)  (ggerganov, Mar 10, 2024)
fbde0d0  metal : move mm_id indices to shared mem (llama/5982)  (ggerganov, Mar 10, 2024)
5ad7833  Add q3_s and q1_s (llama/5886)  (abhilash1910, Mar 11, 2024)
b87e474  Better 1.5 bit quantization (llama/5971)  (ikawrakow, Mar 11, 2024)
12954cf  ggml, ci : Windows ARM runner and build fixes (llama/5979)  (Xarbirus, Mar 11, 2024)
6523b9e  1.5 bit: we can do even better (llama/5999)  (ikawrakow, Mar 11, 2024)
a17f521  sycl : update IQ1_S kernels (WIP - not working!) (llama/5995)  (ggerganov, Mar 12, 2024)
8315615  ggml : fix UB in IQ2_S and IQ3_S (llama/6012)  (ggerganov, Mar 12, 2024)
2afdf95  ggml : reuse quantum structs across backends (llama/5943)  (ggerganov, Mar 12, 2024)
289f940  Update get version (llama/6025)  (AidanBeltonS, Mar 13, 2024)
d280436  test-backend-ops : skip CPU backend by default (llama/6028)  (slaren, Mar 13, 2024)
94dbea1  llama : add pipeline parallelism support (llama/6017)  (slaren, Mar 13, 2024)
ebcaebe  metal : build metallib + fix embed path (llama/6015)  (ggerganov, Mar 14, 2024)
82ce195  ggml : designate enum vals for integer types (llama/6050)  (ggerganov, Mar 14, 2024)
f2e770c  sync : llama.cpp  (ggerganov, Mar 14, 2024)
2bacd6a  ggml : add ggml-common.h  (ggerganov, Mar 14, 2024)
375f9c1  update examples and tests  (slaren, Mar 14, 2024)
29 changes: 2 additions & 27 deletions examples/common-ggml.cpp
@@ -90,8 +90,6 @@ bool ggml_common_quantize_0(
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;

- std::vector<int64_t> hist_all(1 << 4, 0);
-
while (true) {
int32_t n_dims;
int32_t length;
@@ -176,8 +174,6 @@ bool ggml_common_quantize_0(
work.resize(nelements); // for quantization

size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@@ -190,7 +186,7 @@
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
- cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
+ cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -217,15 +213,7 @@ bool ggml_common_quantize_0(
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;

printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < (int) hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}

for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -238,18 +226,5 @@ bool ggml_common_quantize_0(
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

- {
- int64_t sum_all = 0;
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
-
- printf("%s: hist: ", __func__);
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
-
return true;
}
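The net effect of this file's changes is that the quantization call no longer carries histogram buffers. Below is a minimal sketch of the updated call pattern; quantize_rows is an illustrative helper, not part of the PR, and the buffer sizing is a deliberately generous assumption.

#include <cstdint>
#include <vector>

#include "ggml.h"

// Quantize nrows rows of n_per_row floats into `work`, mirroring the call in
// ggml_common_quantize_0() above. The histogram arrays are gone; the trailing
// argument is the optional importance matrix (nullptr when unused).
static size_t quantize_rows(ggml_type ttype, const std::vector<float> & data_f32,
                            std::vector<uint8_t> & work,
                            int64_t nrows, int64_t n_per_row) {
    work.resize(data_f32.size() * sizeof(float)); // generous upper bound on the quantized size
    return ggml_quantize_chunk(ttype, data_f32.data(), work.data(),
                               /*start=*/0, nrows, n_per_row, /*imatrix=*/nullptr);
}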
13 changes: 5 additions & 8 deletions examples/gpt-2/main-batched.cpp
@@ -419,21 +419,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer);
+ ggml_tallocr alloc = ggml_tallocr_new(model.kv_cache.buffer);

// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.kv_cache.k);
- ggml_tallocr_alloc(alloc, model.kv_cache.v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.k);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.v);
}
}

// load weights
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_w);

size_t total_size = 0;

@@ -495,7 +493,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
return false;
}

- ggml_tallocr_alloc(alloc, tensor);
+ ggml_tallocr_alloc(&alloc, tensor);

if (ggml_backend_is_cpu (model.backend)
#ifdef GGML_USE_METAL
@@ -525,7 +523,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
total_size += ggml_nbytes(tensor);
}

- ggml_tallocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}

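The edits in this file track the new ggml_tallocr API visible in the diff: the allocator is created by value, passed by address to ggml_tallocr_alloc(), and no longer freed. A short sketch of the pattern follows; alloc_kv_tensors and its arguments are placeholders, not code from the PR.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch of the value-type ggml_tallocr usage shown above; the buffer and
// tensors are assumed to come from an existing backend buffer and a
// no_alloc ggml_context, as in the example.
static void alloc_kv_tensors(ggml_backend_buffer_t buffer,
                             struct ggml_tensor * k, struct ggml_tensor * v) {
    ggml_tallocr alloc = ggml_tallocr_new(buffer); // returned by value, no ggml_tallocr_free() needed
    ggml_tallocr_alloc(&alloc, k);                 // the allocator is now passed by address
    ggml_tallocr_alloc(&alloc, v);
}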
27 changes: 13 additions & 14 deletions examples/gpt-2/main-sched.cpp
@@ -342,7 +342,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}

// allocate buffers
- std::map<ggml_backend_t, std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>> backend_buffers;
+ std::map<ggml_backend_t, ggml_tallocr> backend_buffers;
for (auto backend : model.backends) {
// compute the size of the buffer
size_t size = 0;
@@ -359,7 +359,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.buffers_w.push_back(buffer);

// create an allocator for the buffer to allocate the tensors
- auto alloc = std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>(ggml_tallocr_new(buffer), ggml_tallocr_free);
+ auto alloc = ggml_tallocr_new(buffer);
backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
} else {
model.buffers_w.push_back(NULL);
@@ -394,15 +394,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_kv);

// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.memory_k);
- ggml_tallocr_alloc(alloc, model.memory_v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.memory_k);
+ ggml_tallocr_alloc(&alloc, model.memory_v);
}
}

@@ -470,7 +468,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensor
ggml_backend_t backend = tensor_backends[name];
- ggml_tallocr * alloc = backend_buffers.find(backend)->second.get();
+ ggml_tallocr * alloc = &backend_buffers.find(backend)->second;
ggml_tallocr_alloc(alloc, tensor);
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());

@@ -490,7 +488,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
+ ggml_tallocr * alloc_head = &backend_buffers.find(tensor_backends["model/lm_head"])->second;
+ ggml_tallocr_alloc(alloc_head, model.lm_head);
//printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
ggml_backend_tensor_copy(tensor, model.lm_head);
total_size += ggml_nbytes(model.lm_head);
@@ -524,10 +523,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);

// allocate the tensors into the backend buffer
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input);
- ggml_tallocr_alloc(alloc, model.embd);
- ggml_tallocr_alloc(alloc, model.position);
- ggml_tallocr_free(alloc);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_input);
+ ggml_tallocr_alloc(&alloc, model.embd);
+ ggml_tallocr_alloc(&alloc, model.position);
}

return true;
@@ -867,6 +865,7 @@ bool gpt2_eval(
struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);

// run the computation
+ ggml_backend_sched_reset(sched);
ggml_backend_sched_graph_compute(sched, gf);

//if (n_past%100 == 0) {
@@ -934,7 +933,7 @@ int main(int argc, char ** argv) {
ggml_backend_sched_t sched;
{
// initialize the scheduler
- sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
+ sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES, false);

// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
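Beyond the allocator changes, this file picks up two scheduler adjustments: ggml_backend_sched_new() takes an extra trailing flag (introduced by the pipeline-parallelism commit llama/6017, passed as false here), and each evaluation now calls ggml_backend_sched_reset() before computing the graph. The sketch below shows that call order; make_sched and eval_graph are illustrative helpers, not code from the PR.

#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helpers mirroring the updated scheduler usage in main-sched.cpp.
static ggml_backend_sched_t make_sched(std::vector<ggml_backend_t> & backends, size_t graph_size) {
    // the extra trailing argument is the flag added by llama/6017; the example passes false
    return ggml_backend_sched_new(backends.data(), /*bufts=*/NULL,
                                  (int) backends.size(), graph_size, false);
}

static void eval_graph(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    ggml_backend_sched_reset(sched);             // reset before every new graph
    ggml_backend_sched_graph_compute(sched, gf); // then split and compute
}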