cuda : rename build flag to LLAMA_CUDA #6299

Merged
merged 2 commits on Mar 26, 2024
2 changes: 1 addition & 1 deletion .devops/llama-cpp-cublas.srpm.spec
@@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
%setup -n llama.cpp-master

%build
make -j LLAMA_CUBLAS=1
make -j LLAMA_CUDA=1

%install
mkdir -p %{buildroot}%{_bindir}/
4 changes: 2 additions & 2 deletions .devops/main-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
# Enable CUDA
ENV LLAMA_CUDA=1

RUN make

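For reference, a minimal sketch of building a CUDA image from this Dockerfile after the rename; the image tag is an illustrative assumption, not part of this change:

```bash
# Build the CUDA-enabled "main" image; LLAMA_CUDA=1 is now set inside the Dockerfile,
# and CUDA_DOCKER_ARCH can be overridden with --build-arg if a specific nvcc target is needed.
docker build -t local/llama.cpp:main-cuda -f .devops/main-cuda.Dockerfile .
```
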
2 changes: 1 addition & 1 deletion .devops/nix/package.nix
@@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUBLAS" useCuda)
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
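Hypothetically, the renamed CMake option is picked up through `useCuda` in the Nix package; a sketch of a flake build, assuming the flake exposes a CUDA package output under this name:

```bash
# Build the CUDA variant via the flake; the ".#cuda" attribute name is an assumption.
nix build .#cuda
```
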
4 changes: 2 additions & 2 deletions .devops/server-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
# Enable CUDA
ENV LLAMA_CUDA=1

RUN make

21 changes: 13 additions & 8 deletions CMakeLists.txt
@@ -89,8 +89,8 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
endif()

if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON)
endif()

if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")

enable_language(CUDA)

@@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()

@@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")

add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
@@ -830,7 +835,7 @@ endif()

set(CUDA_CXX_FLAGS "")

if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
set(CUDA_FLAGS -use_fast_math)

if (LLAMA_FATAL_WARNINGS)
@@ -1055,7 +1060,7 @@ endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
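A minimal sketch of the two configure paths after this change (build directory names are illustrative): the new flag is preferred, while the deprecated one still maps onto `LLAMA_CUDA` and emits the warning added above.

```bash
# Preferred: configure and build with the new flag.
cmake -B build -DLLAMA_CUDA=ON
cmake --build build --config Release

# Still accepted for now: the deprecated flag sets LLAMA_CUDA internally and warns
# "LLAMA_CUBLAS is deprecated and will be removed in the future."
cmake -B build-legacy -DLLAMA_CUBLAS=ON
```
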
26 changes: 19 additions & 7 deletions Makefile
@@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
# $(error LLAMA_CUBLAS is deprecated. Use LLAMA_CUDA instead.)
LLAMA_CUDA := 1
endif

ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -462,7 +467,7 @@ endif

ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE
@@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)

endif # LLAMA_CUBLAS
endif # LLAMA_CUDA

ifdef LLAMA_CLBLAST

@@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
@@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

# identify CUDA host compiler
ifdef LLAMA_CUBLAS
ifdef LLAMA_CUDA
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUBLAS
ifdef LLAMA_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # LLAMA_CUBLAS
endif # LLAMA_CUDA
$(info )

ifdef LLAMA_CUBLAS
$(info !!!!)
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
$(info !!!!)
$(info )
endif

#
# Build library
#
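The same applies to the Makefile path; a quick sketch (parallelism is illustrative), where the legacy spelling still builds with CUDA but prints the notice added above:

```bash
# New spelling of the build flag.
make -j LLAMA_CUDA=1

# Legacy spelling: still maps to LLAMA_CUDA, but prints
#   "LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead."
make -j LLAMA_CUBLAS=1
```
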
11 changes: 4 additions & 7 deletions README.md
@@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements

Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

- #### cuBLAS
- #### CUDA

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

For Jetson users: if you have a Jetson Orin, you can try this: [Official Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an older model (Nano/TX2), some additional steps are needed before compiling.

- Using `make`:
```bash
make LLAMA_CUBLAS=1
make LLAMA_CUDA=1
```
- Using `CMake`:

```bash
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release
```

The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

<!---
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
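To illustrate the `CUDA_VISIBLE_DEVICES` note in the README section above, a hypothetical run (the binary name, model path and layer count are placeholders):

```bash
# Restrict llama.cpp to the first GPU and offload up to 99 layers to it.
CUDA_VISIBLE_DEVICES=0 ./main -m models/llama-2-7b.Q4_K_M.gguf -p "Hello" -ngl 99
```
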
6 changes: 3 additions & 3 deletions ci/run.sh
@@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert.py ${path_models}

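The CI script stays driven by the `GG_BUILD_CUDA` environment variable and now passes `-DLLAMA_CUDA=1` through `CMAKE_EXTRA`; a sketch of a local run, with the output and mount paths as assumptions:

```bash
# Run the CI suite with the CUDA build enabled; results and downloads go under ./tmp.
mkdir -p tmp/results tmp/mnt
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```
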
28 changes: 14 additions & 14 deletions common/common.cpp
@@ -48,12 +48,12 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
#define GGML_USE_CUBLAS_SYCL
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL))
#define GGML_USE_CUDA_SYCL
#endif

#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
#define GGML_USE_CUBLAS_SYCL_VULKAN
#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN)
#define GGML_USE_CUDA_SYCL_VULKAN
#endif

#if defined(LLAMA_USE_CURL)
@@ -861,9 +861,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.main_gpu = std::stoi(argv[i]);
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
#ifndef GGML_USE_CUDA_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
#endif // GGML_USE_CUDA_SYCL
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
@@ -889,9 +889,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
#ifndef GGML_USE_CUBLAS_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
#ifndef GGML_USE_CUDA_SYCL
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
#endif // GGML_USE_CUDA_SYCL
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
@@ -917,9 +917,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.tensor_split[i] = 0.0f;
}
}
#ifndef GGML_USE_CUBLAS_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUBLAS_SYCL
#ifndef GGML_USE_CUDA_SYCL_VULKAN
fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n");
#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--no-mmap") {
@@ -2387,7 +2387,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
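With the renamed guards, the CPU-only warnings now mention CUDA/SYCL instead of cuBLAS/SYCL; for example, a hypothetical invocation on a build compiled without GPU support (the model path is a placeholder):

```bash
# On a CPU-only build, selecting a main GPU has no effect and prints:
#   "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect."
./main -m models/llama-2-7b.Q4_K_M.gguf --main-gpu 1 -p "Hello"
```
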
2 changes: 1 addition & 1 deletion examples/imatrix/README.md
@@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
## Example

```bash
LLAMA_CUBLAS=1 make -j
LLAMA_CUDA=1 make -j

# generate importance matrix (imatrix.dat)
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
4 changes: 2 additions & 2 deletions examples/llama-bench/llama-bench.cpp
@@ -113,7 +113,7 @@ static std::string get_cpu_info() {

static std::string get_gpu_info() {
std::string id;
#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
int count = ggml_backend_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
@@ -808,7 +808,7 @@ struct test {

const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cublas();
const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
4 changes: 2 additions & 2 deletions examples/llava/clip.cpp
@@ -7,7 +7,7 @@
#include "ggml-alloc.h"
#include "ggml-backend.h"

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

@@ -968,7 +968,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
}

#ifdef GGML_USE_CUBLAS
#ifdef GGML_USE_CUDA
new_clip->backend = ggml_backend_cuda_init(0);
printf("%s: CLIP using CUDA backend\n", __func__);
#endif