Skip to content

Commit

Permalink
ggml : add dynamic CUDA driver loader and static link against CUDA runtime
Browse files Browse the repository at this point in the history

This approach lets CUDA-enabled binaries run on systems without
CUDA-supported GPUs and fall back to alternative computation methods.
  • Loading branch information
didzis committed Feb 6, 2024
1 parent 7a74e92 commit f10a7b4
Show file tree
Hide file tree
Showing 4 changed files with 164 additions and 6 deletions.
11 changes: 9 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ else()
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)

option(WHISPER_DYNAMIC_CUDA "whisper: load CUDA dynamically" OFF)
endif()

option(WHISPER_PERF "whisper: enable perf timings" OFF)
Expand Down Expand Up @@ -217,7 +219,7 @@ if (WHISPER_CUBLAS)

add_compile_definitions(GGML_USE_CUBLAS)

if (WHISPER_STATIC)
if (WHISPER_STATIC OR WHISPER_DYNAMIC_CUDA)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
Expand All @@ -228,7 +230,12 @@ if (WHISPER_CUBLAS)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
if (WHISPER_DYNAMIC_CUDA)
set(GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA} cuda-loader.c)
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
endif()

else()
message(FATAL_ERROR "cuBLAS not found")
endif()
Expand Down
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,13 @@ ifdef WHISPER_CUBLAS

CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
ifdef WHISPER_DYNAMIC_CUDA
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt
WHISPER_OBJ += cuda-loader.o
else
LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt
endif
LDFLAGS += -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
WHISPER_OBJ += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
Expand Down
140 changes: 140 additions & 0 deletions cuda-loader.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#include <stdio.h>

#include <dlfcn.h>

#include <cuda.h>


// Function-pointer types mirroring the signatures of the CUDA driver API
// entry points this loader resolves at runtime (declared in cuda.h).
typedef CUresult (*cuDeviceGet_pt)(CUdevice *device, int ordinal);
typedef CUresult (*cuDeviceGetAttribute_pt)(int *pi, CUdevice_attribute attrib, CUdevice dev);
typedef CUresult (*cuGetErrorString_pt)(CUresult error, const char **pStr);
typedef CUresult (*cuMemGetAllocationGranularity_pt)(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
typedef CUresult (*cuMemCreate_pt)(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
typedef CUresult (*cuMemAddressReserve_pt)(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
typedef CUresult (*cuMemMap_pt)(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
typedef CUresult (*cuMemRelease_pt)(CUmemGenericAllocationHandle handle);
typedef CUresult (*cuMemSetAccess_pt)(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);


// Resolved driver entry points. NULL until load_libcuda() succeeds; a
// pointer may stay NULL after a successful load if dlsym() could not find
// that particular symbol in the installed driver.
cuDeviceGet_pt _cuDeviceGet = NULL;
cuDeviceGetAttribute_pt _cuDeviceGetAttribute = NULL;
cuGetErrorString_pt _cuGetErrorString = NULL;
cuMemGetAllocationGranularity_pt _cuMemGetAllocationGranularity = NULL;
cuMemCreate_pt _cuMemCreate = NULL;
cuMemAddressReserve_pt _cuMemAddressReserve = NULL;
cuMemMap_pt _cuMemMap = NULL;
cuMemRelease_pt _cuMemRelease = NULL;
cuMemSetAccess_pt _cuMemSetAccess = NULL;


// Try to dlopen the CUDA driver library and resolve the entry points used
// by the stubs below. Returns 1 when the library is loaded, 0 otherwise.
// The outcome is cached in a local static: after the first failed attempt
// every later call returns 0 immediately without retrying dlopen().
// NOTE(review): not thread-safe — assumes first call happens before any
// concurrent use of the stubs; confirm against callers.
int load_libcuda(void) {

    // NULL = not yet attempted, (void*)1 = attempt failed, else = dlopen handle.
    static void * handle = NULL;

    if (handle != NULL) {
        return (handle == (void*)1) ? 0 : 1;
    }

    handle = dlopen("libcuda.so", RTLD_NOW);
    if (handle == NULL) {
        // Unversioned name may be absent without the dev package; try the SONAME.
        handle = dlopen("libcuda.so.1", RTLD_NOW);
    }

    if (handle == NULL) {
        fprintf(stderr, "error: failed to load libcuda.so: %s\n", dlerror());
        handle = (void*)1; // tried and failed
        return 0;
    }

    // Casting void* from dlsym() to function pointers is not ISO C; silence -Wpedantic.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
    _cuDeviceGet                   = (cuDeviceGet_pt)dlsym(handle, "cuDeviceGet");
    _cuDeviceGetAttribute          = (cuDeviceGetAttribute_pt)dlsym(handle, "cuDeviceGetAttribute");
    _cuGetErrorString              = (cuGetErrorString_pt)dlsym(handle, "cuGetErrorString");
    _cuMemGetAllocationGranularity = (cuMemGetAllocationGranularity_pt)dlsym(handle, "cuMemGetAllocationGranularity");
    _cuMemCreate                   = (cuMemCreate_pt)dlsym(handle, "cuMemCreate");
    _cuMemAddressReserve           = (cuMemAddressReserve_pt)dlsym(handle, "cuMemAddressReserve");
    _cuMemMap                      = (cuMemMap_pt)dlsym(handle, "cuMemMap");
    _cuMemRelease                  = (cuMemRelease_pt)dlsym(handle, "cuMemRelease");
    _cuMemSetAccess                = (cuMemSetAccess_pt)dlsym(handle, "cuMemSetAccess");
#pragma GCC diagnostic pop

    return 1;
}


// Stub for the driver's cuDeviceGet: lazily loads libcuda on first use and
// forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
    if (_cuDeviceGet == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuDeviceGet == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuDeviceGet(device, ordinal);
}

// Stub for the driver's cuDeviceGetAttribute: lazily loads libcuda on first
// use and forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) {
    if (_cuDeviceGetAttribute == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuDeviceGetAttribute == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuDeviceGetAttribute(pi, attrib, dev);
}

// Stub for the driver's cuGetErrorString: lazily loads libcuda on first use
// and forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
    if (_cuGetErrorString == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuGetErrorString == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuGetErrorString(error, pStr);
}

// Stub for the driver's cuMemGetAllocationGranularity: lazily loads libcuda
// on first use and forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option) {
    if (_cuMemGetAllocationGranularity == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuMemGetAllocationGranularity == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuMemGetAllocationGranularity(granularity, prop, option);
}

// Stub for the driver's cuMemCreate: lazily loads libcuda on first use and
// forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags) {
    if (_cuMemCreate == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuMemCreate == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuMemCreate(handle, size, prop, flags);
}

// Stub for the driver's cuMemAddressReserve: lazily loads libcuda on first
// use and forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) {
    if (_cuMemAddressReserve == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuMemAddressReserve == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuMemAddressReserve(ptr, size, alignment, addr, flags);
}

// Stub for the driver's cuMemMap: lazily loads libcuda on first use and
// forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) {
    if (_cuMemMap == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuMemMap == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuMemMap(ptr, size, offset, handle, flags);
}

// Stub for the driver's cuMemRelease: lazily loads libcuda on first use and
// forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
    if (_cuMemRelease == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuMemRelease == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuMemRelease(handle);
}

// Stub for the driver's cuMemSetAccess: lazily loads libcuda on first use
// and forwards to the resolved symbol, or reports why it cannot.
CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count) {
    if (_cuMemSetAccess == NULL) {
        if (!load_libcuda()) {
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        }
        if (_cuMemSetAccess == NULL) {
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
        }
    }
    return _cuMemSetAccess(ptr, size, desc, count);
}
11 changes: 8 additions & 3 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,8 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#else
#include <cblas.h>
#endif
#elif defined(GGML_USE_CUBLAS)
#endif
#if defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
Expand Down Expand Up @@ -20442,7 +20443,11 @@ int ggml_cpu_has_wasm_simd(void) {
}

int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
return 1;
#elif defined(GGML_USE_CUBLAS)
return ggml_cublas_loaded();
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
Expand All @@ -20451,7 +20456,7 @@ int ggml_cpu_has_blas(void) {

int ggml_cpu_has_cublas(void) {
#if defined(GGML_USE_CUBLAS)
return 1;
return ggml_cublas_loaded();
#else
return 0;
#endif
Expand Down

0 comments on commit f10a7b4

Please sign in to comment.