-
Notifications
You must be signed in to change notification settings - Fork 909
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add simple example for explain memory management and basic operation of ggml
- Loading branch information
Showing
5 changed files
with
389 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# | ||
# simple-ctx | ||
|
||
set(TEST_TARGET simple-ctx) | ||
add_executable(${TEST_TARGET} simple-ctx.cpp) | ||
target_link_libraries(${TEST_TARGET} PRIVATE ggml) | ||
|
||
# | ||
# simple-backend | ||
|
||
set(TEST_TARGET simple-backend) | ||
add_executable(${TEST_TARGET} simple-backend.cpp) | ||
target_link_libraries(${TEST_TARGET} PRIVATE ggml) | ||
|
||
if (GGML_CUBLAS) | ||
add_compile_definitions(GGML_USE_CUBLAS) | ||
endif() | ||
|
||
if (GGML_METAL) | ||
add_compile_definitions(GGML_USE_METAL) | ||
endif() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
## Simple | ||
|
||
This example simply performs a matrix multiplication, solely for the purpose of demonstrating a basic usage of ggml and backend handling. The code is commented to help understand what each part does. | ||
|
||
$$ | ||
\begin{bmatrix} | ||
2 & 8 \\ | ||
5 & 1 \\ | ||
4 & 2 \\ | ||
8 & 6 \\ | ||
\end{bmatrix} | ||
\times | ||
\begin{bmatrix} | ||
10 & 9 & 5 \\ | ||
5 & 9 & 4 \\ | ||
\end{bmatrix} | ||
\= | ||
\begin{bmatrix} | ||
60 & 110 & 54 & 29 \\ | ||
55 & 90 & 126 & 28 \\ | ||
50 & 54 & 42 & 64 \\ | ||
\end{bmatrix} | ||
$$ | ||
|
||
The `simple-ctx` doesn't support gpu acceleration. `simple-backend` demonstrates how to use other backends like CUDA and Metal. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
#include "ggml.h" | ||
#include "ggml/ggml-alloc.h" | ||
#include "ggml/ggml-backend.h" | ||
|
||
#ifdef GGML_USE_CUBLAS | ||
#include "ggml-cuda.h" | ||
#endif | ||
|
||
#ifdef GGML_USE_METAL | ||
#include "ggml-metal.h" | ||
#endif | ||
|
||
#include <cassert> | ||
#include <cmath> | ||
#include <cstdio> | ||
#include <cstring> | ||
#include <fstream> | ||
#include <map> | ||
#include <string> | ||
#include <vector> | ||
|
||
// This is a simple model with two tensors a and b | ||
struct simple_model { | ||
struct ggml_tensor * a; | ||
struct ggml_tensor * b; | ||
|
||
// the backend to perform the computation (CPU, CUDA, METAL) | ||
ggml_backend_t backend = NULL; | ||
|
||
// the backend buffer to storage the tensors data of a and b | ||
ggml_backend_buffer_t buffer; | ||
|
||
// the context to define the tensor information (dimensions, size, memory address) | ||
struct ggml_context * ctx; | ||
}; | ||
|
||
// initialize the tensors of the model in this case two matrices 2x2 | ||
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) { | ||
// initialize the backend | ||
#ifdef GGML_USE_CUBLAS | ||
fprintf(stderr, "%s: using CUDA backend\n", __func__); | ||
model.backend = ggml_backend_cuda_init(0); // init device 0 | ||
if (!model.backend) { | ||
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); | ||
} | ||
#endif | ||
|
||
#ifdef GGML_USE_METAL | ||
fprintf(stderr, "%s: using Metal backend\n", __func__); | ||
ggml_metal_log_set_callback(ggml_log_callback_default, nullptr); | ||
model.backend = ggml_backend_metal_init(); | ||
if (!model.backend) { | ||
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); | ||
} | ||
#endif | ||
|
||
// if there aren't GPU Backends fallback to CPU backend | ||
if (!model.backend) { | ||
model.backend = ggml_backend_cpu_init(); | ||
} | ||
|
||
int num_tensors = 2; | ||
|
||
struct ggml_init_params params { | ||
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors, | ||
/*.mem_buffer =*/ NULL, | ||
/*.no_alloc =*/ true, | ||
}; | ||
|
||
// create context | ||
model.ctx = ggml_init(params); | ||
|
||
// create tensors | ||
model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A); | ||
model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B); | ||
|
||
// create a backend buffer (backend memory) and alloc the tensors from the context | ||
model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend); | ||
|
||
// load data from cpu memory to backend buffer | ||
ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a)); | ||
ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); | ||
} | ||
|
||
// build the compute graph to perform a matrix multiplication | ||
struct ggml_cgraph * build_graph(const simple_model& model) { | ||
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(); | ||
static std::vector<uint8_t> buf(buf_size); | ||
|
||
struct ggml_init_params params0 = { | ||
/*.mem_size =*/ buf_size, | ||
/*.mem_buffer =*/ buf.data(), | ||
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() | ||
}; | ||
|
||
// create a temporally context to build the graph | ||
struct ggml_context * ctx0 = ggml_init(params0); | ||
|
||
struct ggml_cgraph * gf = ggml_new_graph(ctx0); | ||
|
||
// result = a*b^T | ||
struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b); | ||
|
||
// build operations nodes | ||
ggml_build_forward_expand(gf, result); | ||
|
||
// delete the temporally context used to build the graph | ||
ggml_free(ctx0); | ||
return gf; | ||
} | ||
|
||
// compute with backend | ||
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) { | ||
// reset the allocator to free all the memory allocated during the previous inference | ||
|
||
struct ggml_cgraph * gf = build_graph(model); | ||
|
||
// allocate tensors | ||
ggml_gallocr_alloc_graph(allocr, gf); | ||
|
||
int n_threads = 1; // number of threads to perform some operations with multi-threading | ||
|
||
if (ggml_backend_is_cpu(model.backend)) { | ||
ggml_backend_cpu_set_n_threads(model.backend, n_threads); | ||
} | ||
|
||
#ifdef GGML_USE_METAL | ||
if (ggml_backend_is_metal(model.backend)) { | ||
ggml_backend_metal_set_n_cb(model.backend, n_threads); | ||
} | ||
#endif | ||
|
||
ggml_backend_graph_compute(model.backend, gf); | ||
|
||
// in this case, the output tensor is the last one in the graph | ||
return gf->nodes[gf->n_nodes - 1]; | ||
} | ||
|
||
int main(void) { | ||
ggml_time_init(); | ||
|
||
// initialize data of matrices to perform matrix multiplication | ||
const int rows_A = 4, cols_A = 2; | ||
|
||
float matrix_A[rows_A * cols_A] = { | ||
2, 8, | ||
5, 1, | ||
4, 2, | ||
8, 6 | ||
}; | ||
|
||
const int rows_B = 3, cols_B = 2; | ||
/* Transpose([ | ||
10, 9, 5, | ||
5, 9, 4 | ||
]) 2 rows, 3 cols */ | ||
float matrix_B[rows_B * cols_B] = { | ||
10, 5, | ||
9, 9, | ||
5, 4 | ||
}; | ||
|
||
simple_model model; | ||
load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B); | ||
|
||
// calculate the temporaly memory required to compute | ||
ggml_gallocr_t allocr = NULL; | ||
|
||
{ | ||
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); | ||
|
||
// create the worst case graph for memory usage estimation | ||
struct ggml_cgraph * gf = build_graph(model); | ||
ggml_gallocr_reserve(allocr, gf); | ||
size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); | ||
|
||
fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0); | ||
} | ||
|
||
// perform computation | ||
struct ggml_tensor * result = compute(model, allocr); | ||
|
||
// create a array to print result | ||
std::vector<float> out_data(ggml_nelements(result)); | ||
|
||
// bring the data from the backend memory | ||
ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result)); | ||
|
||
// expected result: | ||
// [ 60.00 110.00 54.00 29.00 | ||
// 55.00 90.00 126.00 28.00 | ||
// 50.00 54.00 42.00 64.00 ] | ||
|
||
printf("mul mat (%d x %d) (transposed result):\n[", result->ne[0], result->ne[1]); | ||
for (int j = 0; j < result->ne[1] /* rows */; j++) { | ||
if (j > 0) { | ||
printf("\n"); | ||
} | ||
|
||
for (int i = 0; i < result->ne[0] /* cols */; i++) { | ||
printf(" %.2f", out_data[i * result->ne[1] + j]); | ||
} | ||
} | ||
printf(" ]\n"); | ||
|
||
// release backend memory used for computation | ||
ggml_gallocr_free(allocr); | ||
|
||
// free memory | ||
ggml_free(model.ctx); | ||
|
||
// release backend memory and free backend | ||
ggml_backend_buffer_free(model.buffer); | ||
ggml_backend_free(model.backend); | ||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#include "ggml.h" | ||
|
||
#include <cassert> | ||
#include <cmath> | ||
#include <cstdio> | ||
#include <cstring> | ||
#include <fstream> | ||
#include <map> | ||
#include <string> | ||
#include <vector> | ||
|
||
// This is a simple model with two tensors a and b | ||
struct simple_model { | ||
struct ggml_tensor * a; | ||
struct ggml_tensor * b; | ||
|
||
// the context to define the tensor information (dimensions, size, memory data) | ||
struct ggml_context * ctx; | ||
}; | ||
|
||
// initialize the tensors of the model in this case two matrices 2x2 | ||
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) { | ||
size_t ctx_size = 0; | ||
{ | ||
ctx_size += rows_A * cols_A * ggml_type_size(GGML_TYPE_F32); // tensor a | ||
ctx_size += rows_B * cols_B * ggml_type_size(GGML_TYPE_F32); // tensor b | ||
ctx_size += 2 * ggml_tensor_overhead(), // tensors | ||
ctx_size += ggml_graph_overhead(); // compute graph | ||
ctx_size += 1024; // some overhead | ||
} | ||
|
||
struct ggml_init_params params { | ||
/*.mem_size =*/ ctx_size, | ||
/*.mem_buffer =*/ NULL, | ||
/*.no_alloc =*/ false, // NOTE: this should be false when using the legacy API | ||
}; | ||
|
||
// create context | ||
model.ctx = ggml_init(params); | ||
|
||
// create tensors | ||
model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A); | ||
model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B); | ||
|
||
memcpy(model.a->data, a, ggml_nbytes(model.a)); | ||
memcpy(model.b->data, b, ggml_nbytes(model.b)); | ||
} | ||
|
||
// build the compute graph to perform a matrix multiplication | ||
struct ggml_cgraph * build_graph(const simple_model& model) { | ||
struct ggml_cgraph * gf = ggml_new_graph(model.ctx); | ||
|
||
// result = a*b^T | ||
struct ggml_tensor * result = ggml_mul_mat(model.ctx, model.a, model.b); | ||
|
||
ggml_build_forward_expand(gf, result); | ||
return gf; | ||
} | ||
|
||
// compute with backend | ||
struct ggml_tensor * compute(const simple_model & model) { | ||
struct ggml_cgraph * gf = build_graph(model); | ||
|
||
int n_threads = 1; // number of threads to perform some operations with multi-threading | ||
|
||
ggml_graph_compute_with_ctx(model.ctx, gf, n_threads); | ||
|
||
// in this case, the output tensor is the last one in the graph | ||
return gf->nodes[gf->n_nodes - 1]; | ||
} | ||
|
||
int main(void) { | ||
ggml_time_init(); | ||
|
||
// initialize data of matrices to perform matrix multiplication | ||
const int rows_A = 4, cols_A = 2; | ||
|
||
float matrix_A[rows_A * cols_A] = { | ||
2, 8, | ||
5, 1, | ||
4, 2, | ||
8, 6 | ||
}; | ||
|
||
const int rows_B = 3, cols_B = 2; | ||
/* Transpose([ | ||
10, 9, 5, | ||
5, 9, 4 | ||
]) 2 rows, 3 cols */ | ||
float matrix_B[rows_B * cols_B] = { | ||
10, 5, | ||
9, 9, | ||
5, 4 | ||
}; | ||
|
||
simple_model model; | ||
load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B); | ||
|
||
// perform computation in cpu | ||
struct ggml_tensor * result = compute(model); | ||
|
||
// get the result data pointer as a float array to print | ||
std::vector<float> out_data(ggml_nelements(result)); | ||
memcpy(out_data.data(), result->data, ggml_nbytes(result)); | ||
|
||
// expected result: | ||
// [ 60.00 110.00 54.00 29.00 | ||
// 55.00 90.00 126.00 28.00 | ||
// 50.00 54.00 42.00 64.00 ] | ||
|
||
printf("mul mat (%d x %d) (transposed result):\n[", result->ne[0], result->ne[1]); | ||
for (int j = 0; j < result->ne[1] /* rows */; j++) { | ||
if (j > 0) { | ||
printf("\n"); | ||
} | ||
|
||
for (int i = 0; i < result->ne[0] /* cols */; i++) { | ||
printf(" %.2f", out_data[i * result->ne[1] + j]); | ||
} | ||
} | ||
printf(" ]\n"); | ||
|
||
// free memory | ||
ggml_free(model.ctx); | ||
return 0; | ||
} |