sync : llama.cpp #773

Merged
merged 35 commits into from Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
aeb0020
gguf : add support for I64 and F64 arrays (llama/6062)
certik Mar 15, 2024
323c87d
Fix non-intel device selection (llama/6042)
AidanBeltonS Mar 15, 2024
aab2444
fix set main gpu error (llama/6073)
NeoZhangJianyu Mar 15, 2024
3359229
cuda : disable unused cudaLaunchHostFunc code (llama/6078)
slaren Mar 15, 2024
116d0a2
ggml : add AVX512F SIMD (llama/6088)
amiralimi Mar 16, 2024
8172925
ggml:fix finding transfer queue family index error (llama/6094)
GainLee Mar 17, 2024
3edce07
backend : offload large batches to GPU (llama/6083)
slaren Mar 18, 2024
d1023de
backend : set max split inputs to GGML_MAX_SRC (llama/6137)
slaren Mar 18, 2024
dff7077
increase igpu cluster limit (llama/6159)
abhilash1910 Mar 20, 2024
881e390
cuda : refactor to remove global resources (llama/6170)
slaren Mar 20, 2024
5fa5e12
cuda : print the returned error when CUDA initialization fails (llama…
slaren Mar 20, 2024
0b457ad
cuda : fix conflict with std::swap (llama/6186)
slaren Mar 21, 2024
8339d37
Add nvidia and amd backends (llama/6157)
AidanBeltonS Mar 21, 2024
052559f
Add ability to use Q5_0, Q5_1, and IQ4_NL for quantized K cache (llam…
ikawrakow Mar 21, 2024
a9e1f05
ggml : same IQ4_NL quantization for CPU/CUDA/Metal (llama/6196)
ikawrakow Mar 21, 2024
f48bc15
cuda : fix LLAMA_CUDA_F16 build (llama/6197)
slaren Mar 21, 2024
3bd340f
cuda : disable host register by default (llama/6206)
slaren Mar 21, 2024
5cca5b4
metal : pad n_ctx by 32 (llama/6177)
ggerganov Mar 22, 2024
e1e0d48
metal : proper assert for mat-mat memory alignment (llama/6225)
ggerganov Mar 22, 2024
f45358a
cuda : add LLAMA_CUDA_NO_PEER_COPY to workaround broken ROCm p2p copy…
slaren Mar 22, 2024
c1ab06b
use _wfopen instead of fopen on Windows (llama/6248)
cebtenzzre Mar 23, 2024
8f1c0a4
offload op (llama/6217)
airMeng Mar 24, 2024
3bc549e
Fix heap corruption from wmode out-of-bound writes on windows (llama/…
TheFlipbook Mar 24, 2024
4a4ba1b
ggml : support AVX512VNNI (llama/6280)
jart Mar 25, 2024
39b7c81
cuda : refactor into multiple files (llama/6269)
slaren Mar 25, 2024
2c3e572
tests : include IQ2_XXS and IQ2_XS in test-quantize-fns (llama/6303)
ikawrakow Mar 25, 2024
1697add
cuda : rename build flag to LLAMA_CUDA (llama/6299)
slaren Mar 26, 2024
847bedc
IQ1_M: 1.75 bpw quantization (llama/6302)
ikawrakow Mar 26, 2024
fe968c1
llama : greatly reduce output buffer memory usage (llama/6122)
compilade Mar 26, 2024
0c19983
Make IQ1_M work for QK_K = 64 (llama/6327)
ikawrakow Mar 27, 2024
348f094
Fix batched impl for NVidia GPU (llama/6164)
AidanBeltonS Mar 27, 2024
6937369
sync : llama.cpp
ggerganov Mar 27, 2024
a4ce4c3
sync : adapt to CUDA changes (#0)
ggerganov Mar 27, 2024
fe3158e
examples : fix CUBLAS leftovers (#0)
ggerganov Mar 27, 2024
11d2033
examples : more CUDA leftovers (#0)
ggerganov Mar 27, 2024
Files changed
3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -44,7 +44,8 @@ option(GGML_NO_ACCELERATE "ggml: disable Accelerate framework" OFF)
option(GGML_OPENBLAS "ggml: use OpenBLAS" OFF)
option(GGML_CLBLAST "ggml: use clBLAST" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_CUBLAS "ggml: use cuBLAS" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_CUBLAS "ggml: use CUDA (deprecated)" OFF)
option(GGML_METAL "ggml: use Metal" OFF)

option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
4 changes: 4 additions & 0 deletions examples/common-ggml.cpp
@@ -70,6 +70,7 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_IQ1_S:
case GGML_FTYPE_MOSTLY_IQ4_NL:
case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -193,6 +194,8 @@ bool ggml_common_quantize_0(
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ2_XXS:
@@ -203,6 +206,7 @@
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
4 changes: 2 additions & 2 deletions examples/gpt-2/main-backend.cpp
@@ -2,7 +2,7 @@
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

@@ -197,7 +197,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}

// initialize the backend
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init(0);
4 changes: 2 additions & 2 deletions examples/gpt-2/main-batched.cpp
@@ -2,7 +2,7 @@
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

@@ -285,7 +285,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}

// initialize the backend
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init(0);
4 changes: 2 additions & 2 deletions examples/gpt-2/main-sched.cpp
@@ -2,7 +2,7 @@
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

@@ -105,7 +105,7 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
ggml_backend_t gpu_backend = NULL;

// initialize the backends
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
if (params.n_gpu_layers > 0) {
fprintf(stderr, "%s: using CUDA backend\n", __func__);
gpu_backend = ggml_backend_cuda_init(0);
4 changes: 2 additions & 2 deletions examples/magika/CMakeLists.txt
@@ -8,8 +8,8 @@ target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
#
# For GPU offloading

if (GGML_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
if (GGML_CUDA)
add_compile_definitions(GGML_USE_CUDA)
endif()

if (GGML_CLBLAST)
2 changes: 1 addition & 1 deletion examples/python/README.md
@@ -52,7 +52,7 @@ As of this writing the best is to use [ggerganov/llama.cpp](https://github.com/g

```bash
git clone https://github.com/ggerganov/llama.cpp
# On a CUDA-enabled system add -DLLAMA_CUBLAS=1
# On a CUDA-enabled system add -DLLAMA_CUDA=1
# On a Mac add -DLLAMA_METAL=1
cmake llama.cpp \
-B llama_build \
8 changes: 4 additions & 4 deletions examples/python/ggml/__init__.pyi
@@ -568,8 +568,8 @@ class lib:
def ggml_cpu_has_clblast() -> int:
""" GGML_API int ggml_cpu_has_clblast (void);"""
...
def ggml_cpu_has_cublas() -> int:
""" GGML_API int ggml_cpu_has_cublas (void);"""
def ggml_cpu_has_cuda() -> int:
""" GGML_API int ggml_cpu_has_cuda (void);"""
...
def ggml_cpu_has_f16c() -> int:
""" GGML_API int ggml_cpu_has_f16c (void);"""
@@ -967,8 +967,8 @@ class lib:
def ggml_init(params: ffi.CData) -> ffi.CData:
""" GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);"""
...
def ggml_init_cublas() -> None:
"""GGML_API void ggml_init_cublas(void);"""
def ggml_init_cuda() -> None:
"""GGML_API void ggml_init_cuda(void);"""
...
def ggml_internal_get_type_traits(type: int) -> ffi.CData:
""" ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);"""
4 changes: 2 additions & 2 deletions examples/simple/CMakeLists.txt
@@ -12,8 +12,8 @@ set(TEST_TARGET simple-backend)
add_executable(${TEST_TARGET} simple-backend.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml)

if (GGML_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS)
if (GGML_CUDA)
add_compile_definitions(GGML_USE_CUDA)
endif()

if (GGML_METAL)
4 changes: 2 additions & 2 deletions examples/simple/simple-backend.cpp
@@ -2,7 +2,7 @@
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

@@ -44,7 +44,7 @@ struct simple_model {
// initialize the tensors of the model in this case two matrices 2x2
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
// initialize the backend
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init(0); // init device 0
if (!model.backend) {
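The example programs above all make the same mechanical change: the `GGML_USE_CUBLAS` guard around backend initialization becomes `GGML_USE_CUDA`. A minimal sketch of that pattern with a CPU fallback follows; `init_best_backend` is a hypothetical helper name and the fallback behaviour is an assumption for illustration, not code from this PR (the existing examples handle the failure case in their own ways).

```cpp
// Sketch only: the GGML_USE_CUDA / ggml_backend_cuda_init(0) pattern from the
// examples above, wrapped in a hypothetical helper with a CPU fallback.
#include "ggml/ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#include <stdio.h>

static ggml_backend_t init_best_backend(void) {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#endif
    if (!backend) {
        // fall back to the CPU backend (assumption: the caller accepts a CPU backend)
        backend = ggml_backend_cpu_init();
    }
    return backend;
}
```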
8 changes: 4 additions & 4 deletions examples/whisper/whisper.cpp
@@ -8,7 +8,7 @@
#include "ggml-metal.h"
#endif

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

@@ -1031,8 +1031,8 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
ggml_backend_t backend_gpu = NULL;

// initialize the backends
#ifdef GGML_USE_CUBLAS
if (params.use_gpu && ggml_cublas_loaded()) {
#ifdef GGML_USE_CUDA
if (params.use_gpu) {
WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
backend_gpu = ggml_backend_cuda_init(params.gpu_device);
if (!backend_gpu) {
@@ -3852,7 +3852,7 @@ const char * whisper_print_system_info(void) {
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "CUDA = " + std::to_string(ggml_cpu_has_cublas()) + " | ";
s += "CUDA = " + std::to_string(ggml_cpu_has_cuda()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) ;

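With the rename, `whisper_print_system_info()` reports the CUDA flag via `ggml_cpu_has_cuda()`. A quick way to inspect the resulting string (assumed usage, not part of this PR):

```cpp
// Prints a line such as "... CUDA = 1 | COREML = 0 | ..." when whisper.cpp
// was built with GGML_USE_CUDA; the exact contents depend on the build flags.
#include "whisper.h"
#include <cstdio>

int main() {
    std::printf("%s\n", whisper_print_system_info());
    return 0;
}
```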
8 changes: 4 additions & 4 deletions include/ggml/ggml-backend.h
@@ -70,11 +70,11 @@ extern "C" {
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);

GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
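The async variant now returns `enum ggml_status` like the synchronous call, and `ggml_backend_supports_op` / `ggml_backend_offload_op` expose per-op capability checks. A minimal sketch of how a caller might use them; only the declarations above come from the header, while the helper name and the surrounding graph construction are assumed and omitted:

```cpp
// Sketch: report which nodes the backend handles natively, then compute the
// graph and check the ggml_status result.
#include "ggml/ggml.h"
#include "ggml/ggml-backend.h"
#include <stdio.h>

static bool compute_graph(ggml_backend_t backend, struct ggml_cgraph * graph) {
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (!ggml_backend_supports_op(backend, node)) {
            fprintf(stderr, "op %s is not supported natively by %s\n",
                    ggml_op_name(node->op), ggml_backend_name(backend));
        }
    }

    enum ggml_status status = ggml_backend_graph_compute(backend, graph);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "graph compute failed with status %d\n", (int) status);
        return false;
    }
    return true;
}
```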
15 changes: 12 additions & 3 deletions include/ggml/ggml.h
@@ -214,9 +214,10 @@
# define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
@@ -366,6 +367,9 @@ extern "C" {
GGML_TYPE_I8 = 24,
GGML_TYPE_I16 = 25,
GGML_TYPE_I32 = 26,
GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_COUNT,
};

@@ -405,6 +409,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
};

// available tensor operations:
@@ -706,6 +711,9 @@ extern "C" {

GGML_API void ggml_print_backtrace(void);

// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);

GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node

@@ -742,6 +750,7 @@ extern "C" {
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
@@ -2348,7 +2357,7 @@ extern "C" {
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
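Two of the additions above are directly user-facing: `ggml_fopen`, which accepts a UTF-8 path on every platform (it maps to `_wfopen` on Windows, per llama/6248), and the renamed `ggml_cpu_has_cuda`. A small usage sketch, with a purely illustrative file name:

```cpp
#include "ggml/ggml.h"
#include <stdio.h>

int main(void) {
    printf("CUDA compiled in: %d\n", ggml_cpu_has_cuda());

    // ggml_fopen takes a UTF-8 path even on Windows, unlike plain fopen
    FILE * f = ggml_fopen("модель.gguf", "rb"); // hypothetical file name
    if (!f) {
        fprintf(stderr, "failed to open file\n");
        return 1;
    }
    fclose(f);
    return 0;
}
```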
2 changes: 2 additions & 0 deletions scripts/sync-llama-am.sh
@@ -94,6 +94,7 @@ if [ -f $SRC_GGML/llama-src.patch ]; then
# ggml-backend-impl.h -> src/ggml-backend-impl.h
# ggml-backend.c -> src/ggml-backend.c
# ggml-common.h -> src/ggml-common.h
# ggml-cuda/* -> src/ggml-cuda/*
# ggml-cuda.cu -> src/ggml-cuda.cu
# ggml-cuda.h -> src/ggml-cuda.h
# ggml-impl.h -> src/ggml-impl.h
@@ -127,6 +128,7 @@ if [ -f $SRC_GGML/llama-src.patch ]; then
-e 's/\/ggml-backend-impl\.h/\/src\/ggml-backend-impl.h/g' \
-e 's/\/ggml-backend\.c/\/src\/ggml-backend.c/g' \
-e 's/\/ggml-common\.h/\/src\/ggml-common.h/g' \
-e 's/\/ggml-cuda\//\/src\/ggml-cuda\//g' \
-e 's/\/ggml-cuda\.cu/\/src\/ggml-cuda.cu/g' \
-e 's/\/ggml-cuda\.h/\/src\/ggml-cuda.h/g' \
-e 's/\/ggml-impl\.h/\/src\/ggml-impl.h/g' \
2 changes: 1 addition & 1 deletion scripts/sync-llama.last
@@ -1 +1 @@
044ec4b2a567f649459ccd20af2f387c784faa51
e82f9e2b833d88cd2b30123ef57346c2cb8abd99
1 change: 1 addition & 0 deletions scripts/sync-llama.sh
@@ -5,6 +5,7 @@ cp -rpv ../llama.cpp/ggml-alloc.c src/ggml-alloc.c
cp -rpv ../llama.cpp/ggml-backend-impl.h src/ggml-backend-impl.h
cp -rpv ../llama.cpp/ggml-backend.c src/ggml-backend.c
cp -rpv ../llama.cpp/ggml-common.h src/ggml-common.h
cp -rpv ../llama.cpp/ggml-cuda/* src/ggml-cuda/
cp -rpv ../llama.cpp/ggml-cuda.cu src/ggml-cuda.cu
cp -rpv ../llama.cpp/ggml-cuda.h src/ggml-cuda.h
cp -rpv ../llama.cpp/ggml-impl.h src/ggml-impl.h
19 changes: 14 additions & 5 deletions src/CMakeLists.txt
@@ -206,17 +206,24 @@ if (GGML_CLBLAST)
endif()

if (GGML_CUBLAS)
message(WARNING "GGML_CUBLAS is deprecated and will be removed in the future.\nUse GGML_CUDA instead")
set(GGML_CUDA ON)
endif()

if (GGML_CUDA)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")

enable_language(CUDA)

set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
file(GLOB GGML_CUDA_SOURCES "ggml-cuda/*.cu")
list(APPEND GGML_CUDA_SOURCES ggml-cuda.h)
list(APPEND GGML_CUDA_SOURCES ggml-cuda.cu)

set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_CUBLAS)
set(GGML_EXTRA_FLAGS ${GGML_EXTRA_FLAGS} -DGGML_USE_CUDA)

if (GGML_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
@@ -245,10 +252,11 @@ if (GGML_CUBLAS)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
endif()
else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()

# TODO: do not build separate ggml-rocm target (see CUDA build above, or llama.cpp for reference)
if (GGML_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)

@@ -266,9 +274,10 @@ if (GGML_HIPBLAS)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")

add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)

if (BUILD_SHARED_LIBS)
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
10 changes: 7 additions & 3 deletions src/ggml-alloc.c
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];

if (ggml_is_view(node)) {
// TODO: better way to add external dependencies
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
// itself is never used and should not be considered a dependency
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
@@ -565,8 +569,8 @@

ggml_gallocr_hash_get(galloc, src)->n_children += 1;

// allocate explicit inputs and leafs
if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
// allocate explicit inputs
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
}
}
5 changes: 5 additions & 0 deletions src/ggml-backend-impl.h
@@ -103,6 +103,11 @@ extern "C" {
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
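For backend authors, the new `offload_op` slot is the hook behind the "offload large batches to GPU" change (llama/6083). The sketch below shows the shape of an implementation; the function name, the batch-size threshold, and the MUL_MAT-only restriction are assumptions for illustration, not the logic of any backend in this PR.

```cpp
// Illustrative offload_op implementation: claim expensive, batched matrix
// multiplications even when the weights live in a CPU buffer.
static bool GGML_CALL my_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    const int64_t min_batch_size = 32; // hypothetical cut-off

    GGML_UNUSED(backend);

    // op->ne[1] is the batch (token) dimension of the mat-mul result
    return op->op == GGML_OP_MUL_MAT && op->ne[1] >= min_batch_size;
}
```

The function would then be assigned to the `offload_op` member of the backend's `ggml_backend_i` interface struct shown above, alongside `supports_op`.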