sync : llama.cpp #764

Merged
merged 21 commits on Mar 14, 2024

21 commits
51426b9  llama : support Mamba Selective State Space Models (llama/5328)  (compilade, Mar 8, 2024)
6fca2e3  ggml : add ggml-common.h to deduplicate shared code (llama/5940)  (ggerganov, Mar 9, 2024)
87755a9  ggml : remove old quantization functions (llama/5942)  (ggerganov, Mar 9, 2024)
64927b1  ggml : fix unnecessary f32 -> f16 -> f32 casts (mmla) (llama/5951)  (ggerganov, Mar 9, 2024)
9a340d0  ggml : remove __constant__ specifier for CUDA tables (llama/5940)  (ggerganov, Mar 10, 2024)
fbde0d0  metal : move mm_id indices to shared mem (llama/5982)  (ggerganov, Mar 10, 2024)
5ad7833  Add q3_s and q1_s (llama/5886)  (abhilash1910, Mar 11, 2024)
b87e474  Better 1.5 bit quantization (llama/5971)  (ikawrakow, Mar 11, 2024)
12954cf  ggml, ci : Windows ARM runner and build fixes (llama/5979)  (Xarbirus, Mar 11, 2024)
6523b9e  1.5 bit: we can do even better (llama/5999)  (ikawrakow, Mar 11, 2024)
a17f521  sycl : update IQ1_S kernels (WIP - not working!) (llama/5995)  (ggerganov, Mar 12, 2024)
8315615  ggml : fix UB in IQ2_S and IQ3_S (llama/6012)  (ggerganov, Mar 12, 2024)
2afdf95  ggml : reuse quantum structs across backends (llama/5943)  (ggerganov, Mar 12, 2024)
289f940  Update get version (llama/6025)  (AidanBeltonS, Mar 13, 2024)
d280436  test-backend-ops : skip CPU backend by default (llama/6028)  (slaren, Mar 13, 2024)
94dbea1  llama : add pipeline parallelism support (llama/6017)  (slaren, Mar 13, 2024)
ebcaebe  metal : build metallib + fix embed path (llama/6015)  (ggerganov, Mar 14, 2024)
82ce195  ggml : designate enum vals for integer types (llama/6050)  (ggerganov, Mar 14, 2024)
f2e770c  sync : llama.cpp  (ggerganov, Mar 14, 2024)
2bacd6a  ggml : add ggml-common.h  (ggerganov, Mar 14, 2024)
375f9c1  update examples and tests  (slaren, Mar 14, 2024)
29 changes: 2 additions & 27 deletions examples/common-ggml.cpp
@@ -90,8 +90,6 @@ bool ggml_common_quantize_0(
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;

- std::vector<int64_t> hist_all(1 << 4, 0);
-
while (true) {
int32_t n_dims;
int32_t length;
@@ -176,8 +174,6 @@ bool ggml_common_quantize_0(
work.resize(nelements); // for quantization

size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@@ -190,7 +186,7 @@
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
{
- cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
+ cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
@@ -217,15 +213,7 @@ bool ggml_common_quantize_0(
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;

printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < (int) hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}

for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@@ -238,18 +226,5 @@ bool ggml_common_quantize_0(
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

- {
- int64_t sum_all = 0;
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
-
- printf("%s: hist: ", __func__);
- for (int i = 0; i < (int) hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
-
return true;
}
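The net effect of this file's changes is that the quantization call no longer carries histogram buffers. Below is a minimal sketch of the updated call pattern; quantize_rows is an illustrative helper, not part of the PR, and the buffer sizing is a deliberately generous assumption.

#include <cstdint>
#include <vector>

#include "ggml.h"

// Quantize nrows rows of n_per_row floats into `work`, mirroring the call in
// ggml_common_quantize_0() above. The histogram arrays are gone; the trailing
// argument is the optional importance matrix (nullptr when unused).
static size_t quantize_rows(ggml_type ttype, const std::vector<float> & data_f32,
                            std::vector<uint8_t> & work,
                            int64_t nrows, int64_t n_per_row) {
    work.resize(data_f32.size() * sizeof(float)); // generous upper bound on the quantized size
    return ggml_quantize_chunk(ttype, data_f32.data(), work.data(),
                               /*start=*/0, nrows, n_per_row, /*imatrix=*/nullptr);
}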
13 changes: 5 additions & 8 deletions examples/gpt-2/main-batched.cpp
@@ -419,21 +419,19 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer);
+ ggml_tallocr alloc = ggml_tallocr_new(model.kv_cache.buffer);

// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.kv_cache.k);
- ggml_tallocr_alloc(alloc, model.kv_cache.v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.k);
+ ggml_tallocr_alloc(&alloc, model.kv_cache.v);
}
}

// load weights
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_w);

size_t total_size = 0;

@@ -495,7 +493,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
return false;
}

- ggml_tallocr_alloc(alloc, tensor);
+ ggml_tallocr_alloc(&alloc, tensor);

if (ggml_backend_is_cpu (model.backend)
#ifdef GGML_USE_METAL
@@ -525,7 +523,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
total_size += ggml_nbytes(tensor);
}

- ggml_tallocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}

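The edits in this file track the new ggml_tallocr API visible in the diff: the allocator is created by value, passed by address to ggml_tallocr_alloc(), and no longer freed. A short sketch of the pattern follows; alloc_kv_tensors and its arguments are placeholders, not code from the PR.

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// Sketch of the value-type ggml_tallocr usage shown above; the buffer and
// tensors are assumed to come from an existing backend buffer and a
// no_alloc ggml_context, as in the example.
static void alloc_kv_tensors(ggml_backend_buffer_t buffer,
                             struct ggml_tensor * k, struct ggml_tensor * v) {
    ggml_tallocr alloc = ggml_tallocr_new(buffer); // returned by value, no ggml_tallocr_free() needed
    ggml_tallocr_alloc(&alloc, k);                 // the allocator is now passed by address
    ggml_tallocr_alloc(&alloc, v);
}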
27 changes: 13 additions & 14 deletions examples/gpt-2/main-sched.cpp
@@ -342,7 +342,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}

// allocate buffers
- std::map<ggml_backend_t, std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>> backend_buffers;
+ std::map<ggml_backend_t, ggml_tallocr> backend_buffers;
for (auto backend : model.backends) {
// compute the size of the buffer
size_t size = 0;
@@ -359,7 +359,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.buffers_w.push_back(buffer);

// create an allocator for the buffer to allocate the tensors
- auto alloc = std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>(ggml_tallocr_new(buffer), ggml_tallocr_free);
+ auto alloc = ggml_tallocr_new(buffer);
backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
} else {
model.buffers_w.push_back(NULL);
@@ -394,15 +394,13 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensors into the backend buffer
{
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_kv);

// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_tallocr_alloc(alloc, model.memory_k);
- ggml_tallocr_alloc(alloc, model.memory_v);
-
- ggml_tallocr_free(alloc);
+ ggml_tallocr_alloc(&alloc, model.memory_k);
+ ggml_tallocr_alloc(&alloc, model.memory_v);
}
}

@@ -470,7 +468,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// allocate the tensor
ggml_backend_t backend = tensor_backends[name];
- ggml_tallocr * alloc = backend_buffers.find(backend)->second.get();
+ ggml_tallocr * alloc = &backend_buffers.find(backend)->second;
ggml_tallocr_alloc(alloc, tensor);
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());

@@ -490,7 +488,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
+ ggml_tallocr * alloc_head = &backend_buffers.find(tensor_backends["model/lm_head"])->second;
+ ggml_tallocr_alloc(alloc_head, model.lm_head);
//printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
ggml_backend_tensor_copy(tensor, model.lm_head);
total_size += ggml_nbytes(model.lm_head);
@@ -524,10 +523,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);

// allocate the tensors into the backend buffer
- ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input);
- ggml_tallocr_alloc(alloc, model.embd);
- ggml_tallocr_alloc(alloc, model.position);
- ggml_tallocr_free(alloc);
+ ggml_tallocr alloc = ggml_tallocr_new(model.buffer_input);
+ ggml_tallocr_alloc(&alloc, model.embd);
+ ggml_tallocr_alloc(&alloc, model.position);
}

return true;
@@ -867,6 +865,7 @@ bool gpt2_eval(
struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);

// run the computation
+ ggml_backend_sched_reset(sched);
ggml_backend_sched_graph_compute(sched, gf);

//if (n_past%100 == 0) {
@@ -934,7 +933,7 @@ int main(int argc, char ** argv) {
ggml_backend_sched_t sched;
{
// initialize the scheduler
- sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
+ sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES, false);

// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
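Beyond the allocator changes, this file picks up two scheduler adjustments: ggml_backend_sched_new() takes an extra trailing flag (introduced by the pipeline-parallelism commit llama/6017, passed as false here), and each evaluation now calls ggml_backend_sched_reset() before computing the graph. The sketch below shows that call order; make_sched and eval_graph are illustrative helpers, not code from the PR.

#include <vector>

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helpers mirroring the updated scheduler usage in main-sched.cpp.
static ggml_backend_sched_t make_sched(std::vector<ggml_backend_t> & backends, size_t graph_size) {
    // the extra trailing argument is the flag added by llama/6017; the example passes false
    return ggml_backend_sched_new(backends.data(), /*bufts=*/NULL,
                                  (int) backends.size(), graph_size, false);
}

static void eval_graph(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
    ggml_backend_sched_reset(sched);             // reset before every new graph
    ggml_backend_sched_graph_compute(sched, gf); // then split and compute
}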