ggerganov · slaren · Mar 26, 2024 · Mar 25, 2024 · Mar 25, 2024
diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
@@ -26,8 +26,8 @@ COPY . .
 
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make
 

diff --git a/.devops/llama-cpp-cublas.srpm.spec → .devops/llama-cpp-cuda.srpm.spec b/.devops/llama-cpp-cublas.srpm.spec → .devops/llama-cpp-cuda.srpm.spec
@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 # It is up to the user to install the correct vendor-specific support.
 
-Name: llama.cpp-cublas
+Name: llama.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
@@ -32,24 +32,24 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master
 
 %build
-make -j LLAMA_CUBLAS=1
+make -j LLAMA_CUDA=1
 
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p main %{buildroot}%{_bindir}/llamacppcuda
+cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
+cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple
 
 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never
 
@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*
 
 %files
-%{_bindir}/llamacppcublas
-%{_bindir}/llamacppcublasserver
-%{_bindir}/llamacppcublassimple
-/usr/lib/systemd/system/llamacublas.service
+%{_bindir}/llamacppcuda
+%{_bindir}/llamacppcudaserver
+%{_bindir}/llamacppcudasimple
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama
 
 %pre

diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .
 
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make
 

diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix
@@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
  (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
  (cmakeBool "LLAMA_BLAS" useBlas)
  (cmakeBool "LLAMA_CLBLAST" useOpenCL)
- (cmakeBool "LLAMA_CUBLAS" useCuda)
+ (cmakeBool "LLAMA_CUDA" useCuda)
  (cmakeBool "LLAMA_HIPBLAS" useRocm)
  (cmakeBool "LLAMA_METAL" useMetalKit)
  (cmakeBool "LLAMA_MPI" useMpi)

diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
@@ -20,8 +20,8 @@ COPY . .
 
 # Set nvcc architecture
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV LLAMA_CUBLAS=1
+# Enable CUDA
+ENV LLAMA_CUDA=1
 
 RUN make
 

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -728,13 +728,13 @@ jobs:
  path: |
  llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip
 
- windows-latest-cmake-cublas:
+ windows-latest-cmake-cuda:
  runs-on: windows-latest
 
  strategy:
  matrix:
  cuda: ['12.2.0', '11.7.1']
- build: ['cublas']
+ build: ['cuda']
 
  steps:
  - name: Clone
@@ -755,7 +755,7 @@ jobs:
  run: |
  mkdir build
  cd build
- cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
+ cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
  cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 
  - name: Determine tag name
@@ -911,7 +911,7 @@ jobs:
  - macOS-latest-make
  - macOS-latest-cmake
  - windows-latest-cmake
- - windows-latest-cmake-cublas
+ - windows-latest-cmake-cuda
  - macOS-latest-cmake-arm64
  - macOS-latest-cmake-x64
 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -89,8 +89,8 @@ endif()
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
-#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing"  OFF)
+option(LLAMA_CUDA  "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS  "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
@@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
 endif()
 
 if (LLAMA_CUBLAS)
+ message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
+ set(LLAMA_CUDA ON)
+endif()
+
+if (LLAMA_CUDA)
  cmake_minimum_required(VERSION 3.17)
 
  find_package(CUDAToolkit)
  if (CUDAToolkit_FOUND)
- message(STATUS "cuBLAS found")
+ message(STATUS "CUDA found")
 
  enable_language(CUDA)
 
@@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
  file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
  list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
- add_compile_definitions(GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_USE_CUDA)
  if (LLAMA_CUDA_FORCE_DMMV)
  add_compile_definitions(GGML_CUDA_FORCE_DMMV)
  endif()
@@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
  message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 
  else()
- message(WARNING "cuBLAS not found")
+ message(WARNING "CUDA not found")
  endif()
 endif()
 
@@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
  file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
  list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
 
- add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+ add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
 
  if (LLAMA_HIP_UMA)
  add_compile_definitions(GGML_HIP_UMA)
@@ -830,7 +835,7 @@ endif()
 
 set(CUDA_CXX_FLAGS "")
 
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
  set(CUDA_FLAGS -use_fast_math)
 
  if (LLAMA_FATAL_WARNINGS)
@@ -1055,7 +1060,7 @@ endif()
 add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
 
-if (LLAMA_CUBLAS)
+if (LLAMA_CUDA)
  list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
  list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
  if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")

diff --git a/Makefile b/Makefile
@@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
+# LLAMA_CUBLAS is deprecated and will be removed in the future; for now it is kept as an alias that enables LLAMA_CUDA
+ LLAMA_CUDA := 1
+endif
+
+ifdef LLAMA_CUDA
  ifneq ('', '$(wildcard /opt/cuda)')
  CUDA_PATH ?= /opt/cuda
  else
  CUDA_PATH ?= /usr/local/cuda
  endif
- MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+ MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
  MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
  OBJS += ggml-cuda.o
  OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
@@ -462,7 +467,7 @@ endif
 
 ifdef JETSON_EOL_MODULE_DETECT
 define NVCC_COMPILE
- $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+ $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
 define NVCC_COMPILE
@@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
  $(NVCC_COMPILE)
 
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 
 ifdef LLAMA_CLBLAST
 
@@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
  LLAMA_CUDA_DMMV_X ?= 32
  LLAMA_CUDA_MMV_Y ?= 1
  LLAMA_CUDA_KQUANTS_ITER ?= 2
- MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+ MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
 ifdef LLAMA_HIP_UMA
  MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
@@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
 override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)
 
 # identify CUDA host compiler
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
 include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
@@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
 $(info I LDFLAGS: $(LDFLAGS))
 $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
-ifdef LLAMA_CUBLAS
+ifdef LLAMA_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
@@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
 endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
-endif # LLAMA_CUBLAS
+endif # LLAMA_CUDA
 $(info )
 
+ifdef LLAMA_CUBLAS
+$(info !!!!)
+$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
+$(info !!!!)
+$(info )
+endif
+
 #
 # Build library
 #

diff --git a/README.md b/README.md
@@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements
 
  Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
 
-- #### cuBLAS
+- #### CUDA
 
- This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
+ This provides GPU acceleration using the CUDA cores of your NVIDIA GPU. Make sure the CUDA toolkit is installed. You can install it with your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or download it from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
 
  For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.
 
  - Using `make`:
  ```bash
- make LLAMA_CUBLAS=1
+ make LLAMA_CUDA=1
  ```
  - Using `CMake`:
 
  ```bash
  mkdir build
  cd build
- cmake .. -DLLAMA_CUBLAS=ON
+ cmake .. -DLLAMA_CUDA=ON
  cmake --build . --config Release
  ```
 
  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
 
-<!---
- | LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
---->
  | Option | Legal values | Default | Description |
  |--------------------------------|------------------------|---------|-------------|
  | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |

diff --git a/ci/run.sh b/ci/run.sh
@@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi
 
 if [ ! -z ${GG_BUILD_CUDA} ]; then
- CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+ CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
 fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
@@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {
 
  set -e
 
- (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
- (time make -j  ) 2>&1 | tee -a $OUT/${ci}-make.log
+ (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
 
  python3 ../convert.py ${path_models}