#include "ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

#ifdef GGML_USE_METAL
// default log callback used by the Metal backend below: forward all log
// messages to stderr
static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
    (void) level;
    (void) user_data;
    fputs(text, stderr);
    fflush(stderr);
}
#endif
// This is a simple model with two tensors a and b
struct simple_model {
    struct ggml_tensor * a;
    struct ggml_tensor * b;

    // the backend used to perform the computation (CPU, CUDA, Metal)
    ggml_backend_t backend = NULL;

    // the backend buffer to store the tensor data of a and b
    ggml_backend_buffer_t buffer;

    // the context holding the tensor metadata (dimensions, size, memory address)
    struct ggml_context * ctx;
};
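
// conceptually: the ggml_context owns only the tensor metadata, the backend
// buffer owns the actual tensor data, and the backend object executes graphs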

// initialize the tensors of the model, in this case two 2D matrices
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
    // initialize the backend
#ifdef GGML_USE_CUBLAS
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    model.backend = ggml_backend_cuda_init(0); // init device 0
    if (!model.backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif

#ifdef GGML_USE_METAL
    fprintf(stderr, "%s: using Metal backend\n", __func__);
    ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
    model.backend = ggml_backend_metal_init();
    if (!model.backend) {
        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
    }
#endif

    // if no GPU backend was initialized, fall back to the CPU backend
    if (!model.backend) {
        model.backend = ggml_backend_cpu_init();
    }

    int num_tensors = 2;

    struct ggml_init_params params {
        /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
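
    // with no_alloc == true the context will not allocate any data for the
    // tensors; mem_size only needs to cover the per-tensor bookkeeping
    // reported by ggml_tensor_overhead(), since the data lives in the backend buffer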

    // create context
    model.ctx = ggml_init(params);

    // create tensors
    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
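
    // note: ggml orders dimensions fastest-varying first, so a 2D tensor is
    // created as (ne0 = columns, ne1 = rows), the reverse of the usual
    // rows x cols convention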

    // create a backend buffer (backend memory) and allocate the context's tensors in it
    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);

    // copy data from host (CPU) memory into the backend buffer
    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
}

// build the compute graph to perform a matrix multiplication
struct ggml_cgraph * build_graph(const simple_model & model) {
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params0 = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };

    // create a temporary context to build the graph
    struct ggml_context * ctx0 = ggml_init(params0);

    struct ggml_cgraph * gf = ggml_new_graph(ctx0);

    // result = a*b^T
    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
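
    // ggml_mul_mat contracts over ne[0] of both operands: the result has
    // ne[0] = a->ne[1] and ne[1] = b->ne[1], and element (i, j) is the dot
    // product of row i of a with row j of b (as stored)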

    // build operations nodes
    ggml_build_forward_expand(gf, result);

    // free the temporary context used to build the graph; the graph itself
    // was allocated inside buf, so it remains valid after this
    ggml_free(ctx0);
    return gf;
}

// compute with backend
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
    struct ggml_cgraph * gf = build_graph(model);

    // allocate the graph tensors from the reserved buffer; this also resets the
    // allocator, releasing the memory used during the previous inference
    ggml_gallocr_alloc_graph(allocr, gf);

    int n_threads = 1; // number of threads to perform some operations with multi-threading

    if (ggml_backend_is_cpu(model.backend)) {
        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
    }

#ifdef GGML_USE_METAL
    if (ggml_backend_is_metal(model.backend)) {
        ggml_backend_metal_set_n_cb(model.backend, n_threads);
    }
#endif

    ggml_backend_graph_compute(model.backend, gf);
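
    // ggml_backend_graph_compute() is synchronous: once it returns, the whole
    // graph has been evaluated and the results can be read back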

    // in this case, the output tensor is the last node in the graph
    return gf->nodes[gf->n_nodes - 1];
}

int main(void) {
    ggml_time_init();

    // initialize data of matrices to perform matrix multiplication
    const int rows_A = 4, cols_A = 2;

    float matrix_A[rows_A * cols_A] = {
        2, 8,
        5, 1,
        4, 2,
        8, 6
    };

    const int rows_B = 3, cols_B = 2;
    /* Transpose([
        10, 9, 5,
        5, 9, 4
    ]) 2 rows, 3 cols */
    float matrix_B[rows_B * cols_B] = {
        10, 5,
        9, 9,
        5, 4
    };
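
    // B is stored as the transpose of the mathematical 2x3 matrix above:
    // ggml_mul_mat computes dot products of rows of a with rows of b, so
    // storing B^T makes the product come out as A x B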

    simple_model model;
    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);

    // calculate the temporary memory required for the computation
    ggml_gallocr_t allocr = NULL;

    {
        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));

        // create the worst case graph for memory usage estimation
        struct ggml_cgraph * gf = build_graph(model);
        ggml_gallocr_reserve(allocr, gf);
        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);

        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
    }
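
    // ggml_gallocr_reserve() sizes and allocates the compute buffer up front;
    // later calls to ggml_gallocr_alloc_graph() reuse it without allocating
    // again, as long as the graph does not grow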

    // perform computation
    struct ggml_tensor * result = compute(model, allocr);

    // create an array to hold the result
    std::vector<float> out_data(ggml_nelements(result));

    // copy the result data back from the backend memory
    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));

    // expected result:
    // [ 60.00 55.00 50.00 110.00
    //   90.00 54.00 54.00 126.00
    //   42.00 29.00 28.00 64.00 ]

    printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
    for (int j = 0; j < result->ne[1] /* rows */; j++) {
        if (j > 0) {
            printf("\n");
        }

        for (int i = 0; i < result->ne[0] /* cols */; i++) {
            // element (i, j) is stored at j*ne[0] + i (ne[0] varies fastest)
            printf(" %.2f", out_data[j * result->ne[0] + i]);
        }
    }
    printf(" ]\n");

    // release backend memory used for computation
    ggml_gallocr_free(allocr);

    // free memory
    ggml_free(model.ctx);

    // release backend memory and free backend
    ggml_backend_buffer_free(model.buffer);
    ggml_backend_free(model.backend);
    return 0;
}