
Commit

update examples and tests
slaren committed Mar 14, 2024
1 parent 2bacd6a commit d1c8942
Showing 7 changed files with 31 additions and 68 deletions.
29 changes: 2 additions & 27 deletions examples/common-ggml.cpp
@@ -90,8 +90,6 @@ bool ggml_common_quantize_0(
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;

- std::vector<int64_t> hist_all(1 << 4, 0);

while (true) {
int32_t n_dims;
int32_t length;
@@ -176,8 +174,6 @@ bool ggml_common_quantize_0(
work.resize(nelements); // for quantization

size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);

switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@@ -190,7 +186,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
- cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
+ cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -217,15 +213,7 @@ bool ggml_common_quantize_0(
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;

printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < (int) hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}

for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -238,18 +226,5 @@ bool ggml_common_quantize_0(
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

- {
- int64_t sum_all = 0;
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
-
- printf("%s: hist: ", __func__);
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }

return true;
}
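The functional change in this file is the removal of the quantization histograms: ggml_quantize_chunk no longer takes a histogram buffer, so hist_cur/hist_all and their printouts are dropped and only the size summary remains. Below is a minimal sketch of a call site against the post-change signature (type, src, dst, start, nrows, n_per_row, imatrix); the quantize_tensor_sketch wrapper and the buffer sizing are illustrative assumptions, not code from this commit.

#include <cstdint>
#include <cstdio>
#include <vector>
#include "ggml.h"

// Quantize one row-major f32 tensor; there is no histogram argument anymore,
// and the trailing pointer is the optional importance matrix (nullptr when unused).
static size_t quantize_tensor_sketch(ggml_type qtype, const std::vector<float> & src, int64_t n_per_row) {
    std::vector<uint8_t> work(src.size() * sizeof(float)); // worst-case output size
    const int64_t nrows = (int64_t) src.size() / n_per_row;
    const size_t cur_size = ggml_quantize_chunk(qtype, src.data(), work.data(),
                                                /*start =*/ 0, nrows, n_per_row,
                                                /*imatrix =*/ nullptr);
    printf("size = %8.2f MB -> %8.2f MB\n",
           src.size() * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
    return cur_size;
}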
13 changes: 5 additions & 8 deletions examples/gpt-2/main-batched.cpp
@@ -419,21 +419,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer);
+ ggml_tallocr alloc = ggml_tallocr_new(model.kv_cache.buffer);

// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.kv_cache.k);
- ggml_tallocr_alloc(alloc, model.kv_cache.v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.k);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.v);
}
}

// load weights
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_w);

size_t total_size = 0;

@@ -495,7 +493,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
return false;
}

- ggml_tallocr_alloc(alloc, tensor);
+ ggml_tallocr_alloc(&alloc, tensor);

if (ggml_backend_is_cpu (model.backend)
#ifdef GGML_USE_METAL
@@ -525,7 +523,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
total_size += ggml_nbytes(tensor);
}

- ggml_tallocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}

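This file shows the allocator change that repeats across the rest of the commit: ggml_tallocr_new now returns a struct ggml_tallocr by value instead of a heap-allocated handle, ggml_tallocr_alloc takes its address, and the matching ggml_tallocr_free calls are gone. A minimal sketch of the new usage follows, assuming an already-initialized ggml_backend_t; the function name, tensor names, and sizes are illustrative, not taken from the example.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Place two tensors from a no_alloc context into one backend buffer using the
// value-based ggml_tallocr API that this commit switches the examples to.
static void alloc_kv_sketch(ggml_backend_t backend) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 2 * ggml_tensor_overhead(), // metadata only
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,                       // tensor data will live in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024);
    struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1024);

    ggml_backend_buffer_t buffer =
        ggml_backend_alloc_buffer(backend, ggml_nbytes(k) + ggml_nbytes(v) + 1024); // slack for alignment

    struct ggml_tallocr alloc = ggml_tallocr_new(buffer); // plain struct, lives on the stack
    ggml_tallocr_alloc(&alloc, k);                        // points k->data into `buffer`
    ggml_tallocr_alloc(&alloc, v);
    // no ggml_tallocr_free(): the allocator is just a cursor over `buffer` and owns nothing

    ggml_backend_buffer_free(buffer);
    ggml_free(ctx);
}

Freeing the buffer at the end is only for the sketch; in the examples the buffers stay alive for the lifetime of the model.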
28 changes: 13 additions & 15 deletions examples/gpt-2/main-sched.cpp
@@ -342,7 +342,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}

// allocate buffers
- std::map<ggml_backend_t, std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>> backend_buffers;
+ std::map<ggml_backend_t, ggml_tallocr> backend_buffers;
for (auto backend : model.backends) {
// compute the size of the buffer
size_t size = 0;
@@ -359,7 +359,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.buffers_w.push_back(buffer);

// create an allocator for the buffer to allocate the tensors
- auto alloc = std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>(ggml_tallocr_new(buffer), ggml_tallocr_free);
+ auto alloc = ggml_tallocr_new(buffer);
backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
} else {
model.buffers_w.push_back(NULL);
@@ -394,15 +394,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_kv);

// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.memory_k);
- ggml_tallocr_alloc(alloc, model.memory_v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.memory_k);
+ ggml_tallocr_alloc(&alloc, model.memory_v);
}
}

@@ -470,8 +468,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensor
ggml_backend_t backend = tensor_backends[name];
- ggml_tallocr * alloc = backend_buffers.find(backend)->second.get();
- ggml_tallocr_alloc(alloc, tensor);
+ ggml_tallocr alloc = backend_buffers.find(backend)->second;
+ ggml_tallocr_alloc(&alloc, tensor);
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());

if (ggml_backend_is_cpu(backend)
@@ -490,7 +488,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
+ ggml_tallocr_alloc(&backend_buffers.find(tensor_backends["model/lm_head"])->second, model.lm_head);
//printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
ggml_backend_tensor_copy(tensor, model.lm_head);
total_size += ggml_nbytes(model.lm_head);
@@ -524,10 +522,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);

// allocate the tensors into the backend buffer
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input);
- ggml_tallocr_alloc(alloc, model.embd);
- ggml_tallocr_alloc(alloc, model.position);
- ggml_tallocr_free(alloc);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_input);
+ ggml_tallocr_alloc(&alloc, model.embd);
+ ggml_tallocr_alloc(&alloc, model.position);
}

return true;
@@ -867,6 +864,7 @@ bool gpt2_eval(
struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);

// run the computation
+ ggml_backend_sched_reset(sched);
ggml_backend_sched_graph_compute(sched, gf);

//if (n_past%100 == 0) {
@@ -934,7 +932,7 @@ int main(int argc, char ** argv) {
ggml_backend_sched_t sched;
{
// initialize the scheduler
- sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
+ sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES, false);

// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
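Besides the allocator change, this file picks up two scheduler updates: ggml_backend_sched_new takes an extra trailing flag (passed as false above, presumably the switch for parallel evaluation across backends), and gpt2_eval now calls ggml_backend_sched_reset before each ggml_backend_sched_graph_compute. A rough sketch of the resulting flow, reusing the example's own identifiers (model, GPT2_MAX_NODES, gpt2_graph, n_past, embd_inp) rather than standing alone:

// create the scheduler once; the new trailing argument is assumed to be the
// parallel-evaluation flag and is left disabled here, as in the example
ggml_backend_sched_t sched = ggml_backend_sched_new(
    model.backends.data(), /*bufts =*/ NULL,
    (int) model.backends.size(), GPT2_MAX_NODES, /*parallel =*/ false);

// then, for every evaluation:
struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);
ggml_backend_sched_reset(sched);              // drop the previous split/allocation state
ggml_backend_sched_graph_compute(sched, gf);  // allocate and run the graph on the backends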
5 changes: 2 additions & 3 deletions tests/test-backend-buffer.cpp
@@ -40,8 +40,8 @@ static void test_buffer(ggml_backend_t backend, ggml_backend_buffer_type_t buft)

GGML_ASSERT(ggml_backend_buffer_get_alloc_size(buffer, tensor) >= n * sizeof(float));

- ggml_tallocr_t allocr = ggml_tallocr_new(buffer);
- ggml_tallocr_alloc(allocr, tensor);
+ struct ggml_tallocr allocr = ggml_tallocr_new(buffer);
+ ggml_tallocr_alloc(&allocr, tensor);

GGML_ASSERT(tensor->data != NULL);

@@ -59,7 +59,6 @@ static void test_buffer(ggml_backend_t backend, ggml_backend_buffer_type_t buft)

GGML_ASSERT(memcmp(data, data2, sizeof(data)) == 0);

- ggml_tallocr_free(allocr);
ggml_backend_buffer_free(buffer);
ggml_free(ctx);
}
8 changes: 3 additions & 5 deletions tests/test-conv1d.cpp
@@ -111,10 +111,10 @@ void load_model(test_model & model, bool use_gpu = false) {
model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N);

// create a allocator
- ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer);

// alloc memory
- ggml_tallocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(&alloc, model.a);

// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
@@ -124,7 +124,7 @@ void load_model(test_model & model, bool use_gpu = false) {
}

// alloc memory
- ggml_tallocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(&alloc, model.b);

if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
@@ -135,8 +135,6 @@ void load_model(test_model & model, bool use_gpu = false) {
} else {
ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
}

- ggml_tallocr_free(alloc);
}

struct ggml_cgraph * build_graph(const test_model& model) {
8 changes: 3 additions & 5 deletions tests/test-conv2d.cpp
@@ -111,10 +111,10 @@ void load_model(test_model & model, bool use_gpu = false) {
model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);

// create a allocator
- ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+ struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);

// alloc memory
- ggml_tallocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(&alloc, model.a);

// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
@@ -124,7 +124,7 @@ void load_model(test_model & model, bool use_gpu = false) {
}

// alloc memory
- ggml_tallocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(&alloc, model.b);

if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
@@ -135,8 +135,6 @@ void load_model(test_model & model, bool use_gpu = false) {
} else {
ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
}

- ggml_tallocr_free(alloc);
}

struct ggml_cgraph * build_graph(const test_model& model) {
8 changes: 3 additions & 5 deletions tests/test-mul-mat.cpp
@@ -85,10 +85,10 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
printf("Matrix B: [%i, %i]\n", K, N);

// create a allocator
- ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
+ struct ggml_tallocr alloc = ggml_tallocr_new(model.buffer);

// alloc memory
- ggml_tallocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(&alloc, model.a);

// load data to buffer
if(ggml_backend_is_cpu(model.backend)
@@ -102,7 +102,7 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
}

// alloc memory
- ggml_tallocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(&alloc, model.b);

if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
@@ -113,8 +113,6 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
} else {
ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); // cuda requires copy the data directly to device
}

- ggml_tallocr_free(alloc);
}

struct ggml_cgraph * build_graph(const test_model& model) {

