From 375f9c1bc45c87e0e27c34556dcbb5aec7f486d5 Mon Sep 17 00:00:00 2001
From: slaren
Date: Thu, 14 Mar 2024 16:45:27 +0100
Subject: [PATCH] update examples and tests

---
 examples/common-ggml.cpp        | 29 ++---------------------------
 examples/gpt-2/main-batched.cpp | 13 +++++--------
 examples/gpt-2/main-sched.cpp   | 27 +++++++++++++--------------
 tests/test-backend-buffer.cpp   |  5 ++---
 tests/test-conv1d.cpp           |  8 +++-----
 tests/test-conv2d.cpp           |  8 +++-----
 tests/test-mul-mat.cpp          |  8 +++-----
 7 files changed, 31 insertions(+), 67 deletions(-)

diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp
index 53811ad19..cf2478f0a 100644
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@@ -90,8 +90,6 @@ bool ggml_common_quantize_0(
     std::vector<ggml_fp16_t> data_f16;
     std::vector<float>       data_f32;
 
-    std::vector<int64_t> hist_all(1 << 4, 0);
-
     while (true) {
         int32_t n_dims;
         int32_t length;
@@ -176,8 +174,6 @@ bool ggml_common_quantize_0(
             work.resize(nelements); // for quantization
 
             size_t cur_size = 0;
-            std::vector<int64_t> hist_cur(1 << 4, 0);
-
             switch ((ggml_type) ttype) {
                 case GGML_TYPE_Q4_0:
                 case GGML_TYPE_Q4_1:
@@ -190,7 +186,7 @@ bool ggml_common_quantize_0(
                 case GGML_TYPE_Q5_K:
                 case GGML_TYPE_Q6_K:
                     {
-                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
+                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
                     } break;
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
@@ -217,15 +213,7 @@ bool ggml_common_quantize_0(
             fout.write(reinterpret_cast<char *>(work.data()), cur_size);
             total_size_new += cur_size;
 
-            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
-            for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                hist_all[i] += hist_cur[i];
-            }
-
-            for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                printf("%5.3f ", hist_cur[i] / (float)nelements);
-            }
-            printf("\n");
+            printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
         } else {
             printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
             fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -238,18 +226,5 @@ bool ggml_common_quantize_0(
     printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
     printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
 
-    {
-        int64_t sum_all = 0;
-        for (int i = 0; i < (int) hist_all.size(); ++i) {
-            sum_all += hist_all[i];
-        }
-
-        printf("%s: hist: ", __func__);
-        for (int i = 0; i < (int) hist_all.size(); ++i) {
-            printf("%5.3f ", hist_all[i] / (float)sum_all);
-        }
-        printf("\n");
-    }
-
     return true;
 }
diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp
index 6ad1838b7..9ba4496cf 100644
--- a/examples/gpt-2/main-batched.cpp
+++ b/examples/gpt-2/main-batched.cpp
@@ -419,21 +419,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 
         // allocate the tensors into the backend buffer
         {
-            ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer);
+            ggml_tallocr alloc = ggml_tallocr_new(model.kv_cache.buffer);
 
             // this updates the pointers in the tensors to point to the correct location in the buffer
             // this is necessary since the ggml_context is .no_alloc == true
             // note that the buffer can actually be a device buffer, depending on the backend
-            ggml_tallocr_alloc(alloc, model.kv_cache.k);
-            ggml_tallocr_alloc(alloc, model.kv_cache.v);
-
-            ggml_tallocr_free(alloc);
+            ggml_tallocr_alloc(&alloc, model.kv_cache.k);
+            ggml_tallocr_alloc(&alloc, model.kv_cache.v);
         }
     }
 
     // load weights
     {
-        ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w);
+        ggml_tallocr alloc = ggml_tallocr_new(model.buffer_w);
 
         size_t total_size = 0;
 
@@ -495,7 +493,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
                 return false;
             }
 
-            ggml_tallocr_alloc(alloc, tensor);
+            ggml_tallocr_alloc(&alloc, tensor);
 
             if (ggml_backend_is_cpu (model.backend)
 #ifdef GGML_USE_METAL
@@ -525,7 +523,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             total_size += ggml_nbytes(tensor);
         }
 
-        ggml_tallocr_free(alloc);
         printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
     }
 
diff --git a/examples/gpt-2/main-sched.cpp b/examples/gpt-2/main-sched.cpp
index e753d5fb3..ad10aa961 100644
--- a/examples/gpt-2/main-sched.cpp
+++ b/examples/gpt-2/main-sched.cpp
@@ -342,7 +342,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     }
 
     // allocate buffers
-    std::map<ggml_backend_t, std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>> backend_buffers;
+    std::map<ggml_backend_t, ggml_tallocr> backend_buffers;
     for (auto backend : model.backends) {
         // compute the size of the buffer
         size_t size = 0;
@@ -359,7 +359,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             model.buffers_w.push_back(buffer);
 
             // create an allocator for the buffer to allocate the tensors
-            auto alloc = std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>(ggml_tallocr_new(buffer), ggml_tallocr_free);
+            auto alloc = ggml_tallocr_new(buffer);
             backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
         } else {
             model.buffers_w.push_back(NULL);
@@ -394,15 +394,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 
         // allocate the tensors into the backend buffer
         {
-            ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv);
+            ggml_tallocr alloc = ggml_tallocr_new(model.buffer_kv);
 
             // this updates the pointers in the tensors to point to the correct location in the buffer
             // this is necessary since the ggml_context is .no_alloc == true
             // note that the buffer can actually be a device buffer, depending on the backend
-            ggml_tallocr_alloc(alloc, model.memory_k);
-            ggml_tallocr_alloc(alloc, model.memory_v);
-
-            ggml_tallocr_free(alloc);
+            ggml_tallocr_alloc(&alloc, model.memory_k);
+            ggml_tallocr_alloc(&alloc, model.memory_v);
         }
     }
 
@@ -470,7 +468,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 
             // allocate the tensor
             ggml_backend_t backend = tensor_backends[name];
-            ggml_tallocr * alloc = backend_buffers.find(backend)->second.get();
+            ggml_tallocr * alloc = &backend_buffers.find(backend)->second;
             ggml_tallocr_alloc(alloc, tensor);
             //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
 
@@ -490,7 +488,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 
             // GPT-2 models share the WTE tensor as the LM head
            if (name == "model/wte" && has_lm_head == false) {
-                ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
+                ggml_tallocr * alloc_head = &backend_buffers.find(tensor_backends["model/lm_head"])->second;
+                ggml_tallocr_alloc(alloc_head, model.lm_head);
                 //printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
                 ggml_backend_tensor_copy(tensor, model.lm_head);
                 total_size += ggml_nbytes(model.lm_head);
@@ -524,10 +523,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);
 
         // allocate the tensors into the backend buffer
-        ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input);
-        ggml_tallocr_alloc(alloc, model.embd);
-        ggml_tallocr_alloc(alloc, model.position);
-        ggml_tallocr_free(alloc);
+        ggml_tallocr alloc = ggml_tallocr_new(model.buffer_input);
+        ggml_tallocr_alloc(&alloc, model.embd);
+        ggml_tallocr_alloc(&alloc, model.position);
     }
     return true;
 }
@@ -867,6 +865,7 @@ bool gpt2_eval(
     struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);
 
     // run the computation
+    ggml_backend_sched_reset(sched);
     ggml_backend_sched_graph_compute(sched, gf);
 
     //if (n_past%100 == 0) {
@@ -934,7 +933,7 @@ int main(int argc, char ** argv) {
     ggml_backend_sched_t sched;
     {
         // initialize the scheduler
-        sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
+        sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES, false);
 
         // create the worst case graph for memory usage estimation
         int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
diff --git a/tests/test-backend-buffer.cpp b/tests/test-backend-buffer.cpp
index d3c9fe0f9..19ba22a52 100644
--- a/tests/test-backend-buffer.cpp
+++ b/tests/test-backend-buffer.cpp
@@ -40,8 +40,8 @@ static void test_buffer(ggml_backend_t backend, ggml_backend_buffer_type_t buft)
 
     GGML_ASSERT(ggml_backend_buffer_get_alloc_size(buffer, tensor) >= n * sizeof(float));
 
-    ggml_tallocr_t allocr = ggml_tallocr_new(buffer);
-    ggml_tallocr_alloc(allocr, tensor);
+    struct ggml_tallocr allocr = ggml_tallocr_new(buffer);
+    ggml_tallocr_alloc(&allocr, tensor);
 
     GGML_ASSERT(tensor->data != NULL);
 
@@ -59,7 +59,6 @@ static void test_buffer(ggml_backend_t backend, ggml_backend_buffer_type_t buft)
 
     GGML_ASSERT(memcmp(data, data2, sizeof(data)) == 0);
 
-    ggml_tallocr_free(allocr);
     ggml_backend_buffer_free(buffer);
     ggml_free(ctx);
 }
diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp
index 79d18b872..b6daa7b69 100644
--- a/tests/test-conv1d.cpp
+++ b/tests/test-conv1d.cpp
@@ -111,10 +111,10 @@ void load_model(test_model & model, bool use_gpu = false) {
     model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N);
 
     // create a allocator
-    ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+    ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
 
     // alloc memory
-    ggml_tallocr_alloc(alloc, model.a);
+    ggml_tallocr_alloc(&alloc, model.a);
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
@@ -124,7 +124,7 @@ void load_model(test_model & model, bool use_gpu = false) {
     }
 
     // alloc memory
-    ggml_tallocr_alloc(alloc, model.b);
+    ggml_tallocr_alloc(&alloc, model.b);
 
     if(ggml_backend_is_cpu(model.backend)
 #ifdef GGML_USE_METAL
@@ -135,8 +135,6 @@ void load_model(test_model & model, bool use_gpu = false) {
     } else {
         ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
     }
-
-    ggml_tallocr_free(alloc);
 }
 
 struct ggml_cgraph * build_graph(const test_model& model) {
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index 1aea5eef3..35d565444 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -111,10 +111,10 @@ void load_model(test_model & model, bool use_gpu = false) {
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
 
     // create a allocator
-    ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+    struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
 
     // alloc memory
-    ggml_tallocr_alloc(alloc, model.a);
+    ggml_tallocr_alloc(&alloc, model.a);
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
@@ -124,7 +124,7 @@ void load_model(test_model & model, bool use_gpu = false) {
     }
 
     // alloc memory
-    ggml_tallocr_alloc(alloc, model.b);
+    ggml_tallocr_alloc(&alloc, model.b);
 
     if(ggml_backend_is_cpu(model.backend)
 #ifdef GGML_USE_METAL
@@ -135,8 +135,6 @@ void load_model(test_model & model, bool use_gpu = false) {
     } else {
         ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
     }
-
-    ggml_tallocr_free(alloc);
 }
 
 struct ggml_cgraph * build_graph(const test_model& model) {
diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp
index f082dea16..07a6ffeba 100644
--- a/tests/test-mul-mat.cpp
+++ b/tests/test-mul-mat.cpp
@@ -85,10 +85,10 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
     printf("Matrix B: [%i, %i]\n", K, N);
 
     // create a allocator
-    ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+    struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);
 
     // alloc memory
-    ggml_tallocr_alloc(alloc, model.a);
+    ggml_tallocr_alloc(&alloc, model.a);
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)
@@ -102,7 +102,7 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
     }
 
     // alloc memory
-    ggml_tallocr_alloc(alloc, model.b);
+    ggml_tallocr_alloc(&alloc, model.b);
 
     if(ggml_backend_is_cpu(model.backend)
 #ifdef GGML_USE_METAL
@@ -113,8 +113,6 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
     } else {
        ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); // cuda requires copy the data directly to device
     }
-
-    ggml_tallocr_free(alloc);
 }
 
 struct ggml_cgraph * build_graph(const test_model& model) {
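
Note (not part of the patch): every hunk above follows the same migration pattern — ggml_tallocr_new() now returns a struct ggml_tallocr by value, ggml_tallocr_alloc() takes its address, and ggml_tallocr_free() no longer exists. Below is a minimal, self-contained C++ sketch of that usage, assuming a ggml revision that includes this API change; the function name tallocr_example, the CPU backend/buffer setup, the tensor shapes, and the extra slack added to the buffer size are illustrative assumptions and are not taken from the patch.

// sketch: allocate two tensors from a backend buffer with the by-value ggml_tallocr API
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void tallocr_example(void) {
    // context that only holds tensor metadata; the tensor data lives in the backend buffer
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
    struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    // backend buffer sized for both tensors (plus some illustrative slack for alignment padding)
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, ggml_nbytes(k) + ggml_nbytes(v) + 512);

    // the allocator is now a plain struct returned by value; there is no ggml_tallocr_free
    struct ggml_tallocr alloc = ggml_tallocr_new(buffer);
    ggml_tallocr_alloc(&alloc, k);
    ggml_tallocr_alloc(&alloc, v);

    // only the buffer, backend and context need to be released
    ggml_backend_buffer_free(buffer);
    ggml_backend_free(backend);
    ggml_free(ctx);
}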