diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c620f51206..f949d76e30f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,8 @@ else() option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF) option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF) option(WHISPER_CLBLAST "whisper: use CLBlast" OFF) + + option(WHISPER_DYNAMIC_CUDA "whisper: load CUDA dynamically" OFF) endif() option(WHISPER_PERF "whisper: enable perf timings" OFF) @@ -217,7 +219,7 @@ if (WHISPER_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) - if (WHISPER_STATIC) + if (WHISPER_STATIC OR WHISPER_DYNAMIC_CUDA) if (WIN32) # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) @@ -228,7 +230,12 @@ if (WHISPER_CUBLAS) set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() - set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver) + if (WHISPER_DYNAMIC_CUDA) + set(GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA} cuda-loader.c) + else() + set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver) + endif() + else() message(FATAL_ERROR "cuBLAS not found") endif() diff --git a/Makefile b/Makefile index 762dc65ea0f..79b8c14719f 100644 --- a/Makefile +++ b/Makefile @@ -216,7 +216,13 @@ ifdef WHISPER_CUBLAS CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include - LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib + ifdef WHISPER_DYNAMIC_CUDA + LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt + WHISPER_OBJ += cuda-loader.o + else + LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread 
-ldl -lrt + endif + LDFLAGS += -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib WHISPER_OBJ += ggml-cuda.o NVCC = nvcc NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG) diff --git a/cuda-loader.c b/cuda-loader.c new file mode 100644 index 00000000000..6c648a1fac1 --- /dev/null +++ b/cuda-loader.c @@ -0,0 +1,140 @@ +#include <cuda.h> + +#include <stdio.h> + +#include <dlfcn.h> + + +typedef CUresult (*cuDeviceGet_pt)(CUdevice *device, int ordinal); +typedef CUresult (*cuDeviceGetAttribute_pt)(int *pi, CUdevice_attribute attrib, CUdevice dev); +typedef CUresult (*cuGetErrorString_pt)(CUresult error, const char **pStr); +typedef CUresult (*cuMemGetAllocationGranularity_pt)(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); +typedef CUresult (*cuMemCreate_pt)(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags); +typedef CUresult (*cuMemAddressReserve_pt)(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags); +typedef CUresult (*cuMemMap_pt)(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); +typedef CUresult (*cuMemRelease_pt)(CUmemGenericAllocationHandle handle); +typedef CUresult (*cuMemSetAccess_pt)(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count); + + +cuDeviceGet_pt _cuDeviceGet = NULL; +cuDeviceGetAttribute_pt _cuDeviceGetAttribute = NULL; +cuGetErrorString_pt _cuGetErrorString = NULL; +cuMemGetAllocationGranularity_pt _cuMemGetAllocationGranularity = NULL; +cuMemCreate_pt _cuMemCreate = NULL; +cuMemAddressReserve_pt _cuMemAddressReserve = NULL; +cuMemMap_pt _cuMemMap = NULL; +cuMemRelease_pt _cuMemRelease = NULL; +cuMemSetAccess_pt _cuMemSetAccess = NULL; + + +int load_libcuda(void) { + + static void * libcuda = NULL; + + if (libcuda == (void*)1) + return 0; + + if (libcuda != NULL) + return 1; + + libcuda = 
dlopen("libcuda.so", RTLD_NOW); + + if (libcuda == NULL) { + libcuda = dlopen("libcuda.so.1", RTLD_NOW); + } + + if (libcuda != NULL) { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" + _cuDeviceGet = (cuDeviceGet_pt)dlsym(libcuda, "cuDeviceGet"); + _cuDeviceGetAttribute = (cuDeviceGetAttribute_pt)dlsym(libcuda, "cuDeviceGetAttribute"); + _cuGetErrorString = (cuGetErrorString_pt)dlsym(libcuda, "cuGetErrorString"); + _cuMemGetAllocationGranularity = (cuMemGetAllocationGranularity_pt)dlsym(libcuda, "cuMemGetAllocationGranularity"); + _cuMemCreate = (cuMemCreate_pt)dlsym(libcuda, "cuMemCreate"); + _cuMemAddressReserve = (cuMemAddressReserve_pt)dlsym(libcuda, "cuMemAddressReserve"); + _cuMemMap = (cuMemMap_pt)dlsym(libcuda, "cuMemMap"); + _cuMemRelease = (cuMemRelease_pt)dlsym(libcuda, "cuMemRelease"); + _cuMemSetAccess = (cuMemSetAccess_pt)dlsym(libcuda, "cuMemSetAccess"); +#pragma GCC diagnostic pop + + return 1; + } + + fprintf(stderr, "error: failed to load libcuda.so: %s\n", dlerror()); + + libcuda = (void*)1; // tried and failed + return 0; +} + + +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) { + if (_cuDeviceGet == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuDeviceGet == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuDeviceGet(device, ordinal); +} + +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) { + if (_cuDeviceGetAttribute == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuDeviceGetAttribute == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuDeviceGetAttribute(pi, attrib, dev); +} + +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) { + if (_cuGetErrorString == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuGetErrorString == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return 
_cuGetErrorString(error, pStr); +} + +CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option) { + if (_cuMemGetAllocationGranularity == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuMemGetAllocationGranularity == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuMemGetAllocationGranularity(granularity, prop, option); +} + +CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { + if (_cuMemCreate == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuMemCreate == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuMemCreate(handle, size, prop, flags); +} + +CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) { + if (_cuMemAddressReserve == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuMemAddressReserve == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuMemAddressReserve(ptr, size, alignment, addr, flags); +} + +CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { + if (_cuMemMap == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuMemMap == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuMemMap(ptr, size, offset, handle, flags); +} + +CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) { + if (_cuMemRelease == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuMemRelease == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuMemRelease(handle); +} + +CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count) { + if 
(_cuMemSetAccess == NULL && !load_libcuda()) + return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED; + if (_cuMemSetAccess == NULL) + return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND; + return _cuMemSetAccess(ptr, size, desc, count); +} diff --git a/ggml.c b/ggml.c index a7a9ea319c5..50734f02b32 100644 --- a/ggml.c +++ b/ggml.c @@ -277,7 +277,8 @@ inline static void * ggml_calloc(size_t num, size_t size) { #else #include #endif -#elif defined(GGML_USE_CUBLAS) +#endif +#if defined(GGML_USE_CUBLAS) #include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" @@ -20442,7 +20443,11 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + return 1; +#elif defined(GGML_USE_CUBLAS) + return ggml_cublas_loaded(); +#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) return 1; #else return 0; @@ -20451,7 +20456,7 @@ int ggml_cpu_has_blas(void) { int ggml_cpu_has_cublas(void) { #if defined(GGML_USE_CUBLAS) - return 1; + return ggml_cublas_loaded(); #else return 0; #endif