Commit d9cd6b5

ggml : add simple example (#713)
* add a simple example explaining memory management and basic ggml operations
1 parent 2b09231 commit d9cd6b5

File tree

examples/CMakeLists.txt
examples/simple/CMakeLists.txt
examples/simple/README.md
examples/simple/simple-backend.cpp
examples/simple/simple-ctx.cpp

5 files changed: +389 −0 lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -24,4 +24,5 @@ add_subdirectory(whisper)
 add_subdirectory(mnist)
 add_subdirectory(sam)
 add_subdirectory(yolo)
+add_subdirectory(simple)
 add_subdirectory(magika)

examples/simple/CMakeLists.txt

Lines changed: 21 additions & 0 deletions
#
# simple-ctx

set(TEST_TARGET simple-ctx)
add_executable(${TEST_TARGET} simple-ctx.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)

#
# simple-backend

set(TEST_TARGET simple-backend)
add_executable(${TEST_TARGET} simple-backend.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)

if (GGML_CUBLAS)
    add_compile_definitions(GGML_USE_CUBLAS)
endif()

if (GGML_METAL)
    add_compile_definitions(GGML_USE_METAL)
endif()

examples/simple/README.md

Lines changed: 25 additions & 0 deletions
## Simple

This example performs a matrix multiplication, solely to demonstrate basic usage of ggml and backend handling. The code is commented to help explain each step.

$$
\begin{bmatrix}
2 & 8 \\
5 & 1 \\
4 & 2 \\
8 & 6 \\
\end{bmatrix}
\times
\begin{bmatrix}
10 & 9 & 5 \\
5 & 9 & 4 \\
\end{bmatrix}
=
\begin{bmatrix}
60 & 90 & 42 \\
55 & 54 & 29 \\
50 & 54 & 28 \\
110 & 126 & 64 \\
\end{bmatrix}
$$

The `simple-ctx` example does not support GPU acceleration; `simple-backend` demonstrates how to use other backends, such as CUDA and Metal.
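
A note on layout helps when reading the sources: ggml tensors are row-major with `ne[0]` as the number of columns, and `ggml_mul_mat(ctx, a, b)` requires both operands to share `ne[0]` (the inner dimension), producing a result with `ne[0] = a->ne[1]` and `ne[1] = b->ne[1]`, i.e. the transposed product. This is why `matrix_B` in the sources below is stored as the transpose of the B shown above. A minimal sketch of the shape convention (an illustration, not part of this commit):

#include "ggml.h"
#include <cstdio>

int main(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 1024*1024, // 1 MB is plenty for two small tensors
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // A is 4x2 -> ne = {2, 4}; B^T is 3x2 -> ne = {2, 3}
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);

    // the result node holds (A*B)^T with ne = {a->ne[1], b->ne[1]} = {4, 3}
    struct ggml_tensor * r = ggml_mul_mat(ctx, a, b);
    printf("result: %d cols x %d rows\n", (int) r->ne[0], (int) r->ne[1]);

    ggml_free(ctx);
    return 0;
}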

examples/simple/simple-backend.cpp

Lines changed: 216 additions & 0 deletions
#include "ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

// This is a simple model with two tensors a and b
struct simple_model {
    struct ggml_tensor * a;
    struct ggml_tensor * b;

    // the backend to perform the computation (CPU, CUDA, METAL)
    ggml_backend_t backend = NULL;

    // the backend buffer to store the tensor data of a and b
    ggml_backend_buffer_t buffer;

    // the context to define the tensor information (dimensions, size, memory address)
    struct ggml_context * ctx;
};

// initialize the tensors of the model, in this case two matrices
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
    // initialize the backend
#ifdef GGML_USE_CUBLAS
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    model.backend = ggml_backend_cuda_init(0); // init device 0
    if (!model.backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif

#ifdef GGML_USE_METAL
    fprintf(stderr, "%s: using Metal backend\n", __func__);
    ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
    model.backend = ggml_backend_metal_init();
    if (!model.backend) {
        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
    }
#endif

    // if there is no GPU backend, fall back to the CPU backend
    if (!model.backend) {
        model.backend = ggml_backend_cpu_init();
    }

    int num_tensors = 2;

    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // the tensor data will live in the backend buffer below
    };

    // create context
    model.ctx = ggml_init(params);

    // create tensors
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    // create a backend buffer (backend memory) and alloc the tensors from the context
    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);

    // load data from CPU memory into the backend buffer
    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
}

// build the compute graph to perform a matrix multiplication
struct ggml_cgraph * build_graph(const simple_model& model) {
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };

    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // result = a*b^T
    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);

    // build the operation nodes
    ggml_build_forward_expand(gf, result);

    // delete the temporary context used to build the graph
    ggml_free(ctx0);
    return gf;
}

// compute with backend
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
    struct ggml_cgraph * gf = build_graph(model);

    // allocate the graph tensors (the allocator reuses the memory of the previous run)
    ggml_gallocr_alloc_graph(allocr, gf);

    int n_threads = 1; // number of threads to perform some operations with multi-threading

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
    }

#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(model.backend)) {
        ggml_backend_metal_set_n_cb(model.backend, n_threads);
    }
#endif

    ggml_backend_graph_compute(model.backend, gf);

    // in this case, the output tensor is the last one in the graph
    return gf->nodes[gf->n_nodes - 1];
}

int main(void) {
    ggml_time_init();

    // initialize data of matrices to perform matrix multiplication
    const int rows_A = 4, cols_A = 2;

    float matrix_A[rows_A * cols_A] = {
        2, 8,
        5, 1,
        4, 2,
        8, 6
    };

    const int rows_B = 3, cols_B = 2;
    /* Transpose([
       10, 9, 5,
       5, 9, 4
       ]) 2 rows, 3 cols */
    float matrix_B[rows_B * cols_B] = {
        10, 5,
        9, 9,
        5, 4
    };

    simple_model model;
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    // calculate the temporary memory required for the computation
    ggml_gallocr_t allocr = NULL;

    {
        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));

        // create the worst case graph for memory usage estimation
        struct ggml_cgraph * gf = build_graph(model);
        ggml_gallocr_reserve(allocr, gf);
        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);

        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
    }

    // perform computation
    struct ggml_tensor * result = compute(model, allocr);

    // create an array to hold the result for printing
    std::vector<float> out_data(ggml_nelements(result));

    // bring the data back from the backend memory
    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));

    // expected result (the transpose of A x B):
    // [  60.00  55.00  50.00 110.00
    //    90.00  54.00  54.00 126.00
    //    42.00  29.00  28.00  64.00 ]

    printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
    for (int j = 0; j < result->ne[1] /* rows */; j++) {
        if (j > 0) {
            printf("\n");
        }

        for (int i = 0; i < result->ne[0] /* cols */; i++) {
            // the result is stored row-major: element (row j, col i) is at j*ne[0] + i
            printf(" %.2f", out_data[j * result->ne[0] + i]);
        }
    }
    printf(" ]\n");

    // release backend memory used for computation
    ggml_gallocr_free(allocr);

    // free memory
    ggml_free(model.ctx);

    // release backend memory and free backend
    ggml_backend_buffer_free(model.buffer);
    ggml_backend_free(model.backend);
    return 0;
}
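
A side note on `ggml_backend_tensor_get`: it copies an arbitrary byte range out of backend memory, so single elements can also be fetched using the tensor's byte strides `nb[]`. A hypothetical helper in the same vein (an illustration, not part of this commit):

// fetch element (row j, col i) of a 2D F32 tensor from backend memory;
// nb[] holds byte strides, so this also works for non-contiguous layouts
static float tensor_get_f32_2d(const struct ggml_tensor * t, int i, int j) {
    float v;
    ggml_backend_tensor_get(t, &v, j*t->nb[1] + i*t->nb[0], sizeof(float));
    return v;
}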

examples/simple/simple-ctx.cpp

Lines changed: 126 additions & 0 deletions
#include "ggml.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

// This is a simple model with two tensors a and b
struct simple_model {
    struct ggml_tensor * a;
    struct ggml_tensor * b;

    // the context to define the tensor information (dimensions, size, memory data)
    struct ggml_context * ctx;
};

// initialize the tensors of the model, in this case two matrices
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
    size_t ctx_size = 0;
    {
        ctx_size += rows_A * cols_A * ggml_type_size(GGML_TYPE_F32); // tensor a
        ctx_size += rows_B * cols_B * ggml_type_size(GGML_TYPE_F32); // tensor b
        ctx_size += 2 * ggml_tensor_overhead(); // tensors
        ctx_size += ggml_graph_overhead(); // compute graph
        ctx_size += 1024; // some overhead
    }

    struct ggml_init_params params {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
    };

    // create context
    model.ctx = ggml_init(params);

    // create tensors
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);

    memcpy(model.a->data, a, ggml_nbytes(model.a));
    memcpy(model.b->data, b, ggml_nbytes(model.b));
}

// build the compute graph to perform a matrix multiplication
struct ggml_cgraph * build_graph(const simple_model& model) {
    struct ggml_cgraph * gf = ggml_new_graph(model.ctx);

    // result = a*b^T
    struct ggml_tensor * result = ggml_mul_mat(model.ctx, model.a, model.b);

    ggml_build_forward_expand(gf, result);
    return gf;
}

// compute with the CPU backend
struct ggml_tensor * compute(const simple_model & model) {
    struct ggml_cgraph * gf = build_graph(model);

    int n_threads = 1; // number of threads to perform some operations with multi-threading

    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);

    // in this case, the output tensor is the last one in the graph
    return gf->nodes[gf->n_nodes - 1];
}

int main(void) {
    ggml_time_init();

    // initialize data of matrices to perform matrix multiplication
    const int rows_A = 4, cols_A = 2;

    float matrix_A[rows_A * cols_A] = {
        2, 8,
        5, 1,
        4, 2,
        8, 6
    };

    const int rows_B = 3, cols_B = 2;
    /* Transpose([
       10, 9, 5,
       5, 9, 4
       ]) 2 rows, 3 cols */
    float matrix_B[rows_B * cols_B] = {
        10, 5,
        9, 9,
        5, 4
    };

    simple_model model;
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    // perform the computation on the CPU
    struct ggml_tensor * result = compute(model);

    // copy the result data into a float array to print it
    std::vector<float> out_data(ggml_nelements(result));
    memcpy(out_data.data(), result->data, ggml_nbytes(result));

    // expected result (the transpose of A x B):
    // [  60.00  55.00  50.00 110.00
    //    90.00  54.00  54.00 126.00
    //    42.00  29.00  28.00  64.00 ]

    printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
    for (int j = 0; j < result->ne[1] /* rows */; j++) {
        if (j > 0) {
            printf("\n");
        }

        for (int i = 0; i < result->ne[0] /* cols */; i++) {
            // the result is stored row-major: element (row j, col i) is at j*ne[0] + i
            printf(" %.2f", out_data[j * result->ne[0] + i]);
        }
    }
    printf(" ]\n");

    // free memory
    ggml_free(model.ctx);
    return 0;
}
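
Because `simple-ctx` keeps the result in ordinary CPU memory, the `memcpy` into `out_data` could also be replaced by per-element reads. A small sketch (an illustration, not part of this commit) using `ggml_get_f32_1d`, which indexes the tensor data as a flat float array:

// print the raw result data in storage order (the transposed product, row by row)
for (int64_t k = 0; k < ggml_nelements(result); k++) {
    printf("%s%.2f", k > 0 ? " " : "", ggml_get_f32_1d(result, (int) k));
}
printf("\n");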
