Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda : rename build flag to LLAMA_CUDA #6299

Merged
merged 2 commits into from Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions .devops/full-cuda.Dockerfile
Expand Up @@ -26,8 +26,8 @@ COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
# Enable CUDA
ENV LLAMA_CUDA=1

RUN make

Expand Down
Expand Up @@ -12,7 +12,7 @@
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.

Name: llama.cpp-cublas
Name: llama.cpp-cuda
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
Expand All @@ -32,24 +32,24 @@ CPU inference for Meta's Lllama2 models using default options.
%setup -n llama.cpp-master

%build
make -j LLAMA_CUBLAS=1
make -j LLAMA_CUDA=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
cp -p main %{buildroot}%{_bindir}/llamacppcuda
cp -p server %{buildroot}%{_bindir}/llamacppcudaserver
cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

Expand All @@ -67,10 +67,10 @@ rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple
/usr/lib/systemd/system/llamacublas.service
%{_bindir}/llamacppcuda
%{_bindir}/llamacppcudaserver
%{_bindir}/llamacppcudasimple
/usr/lib/systemd/system/llamacuda.service
%config /etc/sysconfig/llama

%pre
Expand Down
4 changes: 2 additions & 2 deletions .devops/main-cuda.Dockerfile
Expand Up @@ -20,8 +20,8 @@ COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
# Enable CUDA
ENV LLAMA_CUDA=1

RUN make

Expand Down
2 changes: 1 addition & 1 deletion .devops/nix/package.nix
Expand Up @@ -191,7 +191,7 @@ effectiveStdenv.mkDerivation (
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_BLAS" useBlas)
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
(cmakeBool "LLAMA_CUBLAS" useCuda)
(cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
Expand Down
4 changes: 2 additions & 2 deletions .devops/server-cuda.Dockerfile
Expand Up @@ -20,8 +20,8 @@ COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1
# Enable CUDA
ENV LLAMA_CUDA=1

RUN make

Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
Expand Up @@ -728,13 +728,13 @@ jobs:
path: |
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip

windows-latest-cmake-cublas:
windows-latest-cmake-cuda:
runs-on: windows-latest

strategy:
matrix:
cuda: ['12.2.0', '11.7.1']
build: ['cublas']
build: ['cuda']

steps:
- name: Clone
Expand All @@ -755,7 +755,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

- name: Determine tag name
Expand Down Expand Up @@ -911,7 +911,7 @@ jobs:
- macOS-latest-make
- macOS-latest-cmake
- windows-latest-cmake
- windows-latest-cmake-cublas
- windows-latest-cmake-cuda
- macOS-latest-cmake-arm64
- macOS-latest-cmake-x64

Expand Down
21 changes: 13 additions & 8 deletions CMakeLists.txt
Expand Up @@ -89,8 +89,8 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_BLAS "llama: use BLAS" OFF)
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
option(LLAMA_CUDA "llama: use CUDA" OFF)
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
Expand Down Expand Up @@ -360,11 +360,16 @@ if (LLAMA_QKK_64)
endif()

if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON)
endif()

if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)

find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
message(STATUS "CUDA found")

enable_language(CUDA)

Expand All @@ -373,7 +378,7 @@ if (LLAMA_CUBLAS)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_CUDA)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
Expand Down Expand Up @@ -422,7 +427,7 @@ if (LLAMA_CUBLAS)
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

else()
message(WARNING "cuBLAS not found")
message(WARNING "CUDA not found")
endif()
endif()

Expand Down Expand Up @@ -525,7 +530,7 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")

add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)

if (LLAMA_HIP_UMA)
add_compile_definitions(GGML_HIP_UMA)
Expand Down Expand Up @@ -830,7 +835,7 @@ endif()

set(CUDA_CXX_FLAGS "")

if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
set(CUDA_FLAGS -use_fast_math)

if (LLAMA_FATAL_WARNINGS)
Expand Down Expand Up @@ -1055,7 +1060,7 @@ endif()
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

if (LLAMA_CUBLAS)
if (LLAMA_CUDA)
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
Expand Down
26 changes: 19 additions & 7 deletions Makefile
Expand Up @@ -390,12 +390,17 @@ ifdef LLAMA_BLIS
endif # LLAMA_BLIS

ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
endif

ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
Expand Down Expand Up @@ -462,7 +467,7 @@ endif

ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endef # NVCC_COMPILE
else
define NVCC_COMPILE
Expand All @@ -476,7 +481,7 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)

endif # LLAMA_CUBLAS
endif # LLAMA_CUDA

ifdef LLAMA_CLBLAST

Expand Down Expand Up @@ -533,7 +538,7 @@ ifdef LLAMA_HIPBLAS
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA
ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
Expand Down Expand Up @@ -609,7 +614,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS)
override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS)

# identify CUDA host compiler
ifdef LLAMA_CUBLAS
ifdef LLAMA_CUDA
GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler
include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
Expand All @@ -634,7 +639,7 @@ $(info I NVCCFLAGS: $(NVCCFLAGS))
$(info I LDFLAGS: $(LDFLAGS))
$(info I CC: $(shell $(CC) --version | head -n 1))
$(info I CXX: $(shell $(CXX) --version | head -n 1))
ifdef LLAMA_CUBLAS
ifdef LLAMA_CUDA
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
Expand All @@ -644,9 +649,16 @@ $(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be exp
endif # CUDA_POWER_ARCH
endif # CUDA_DOCKER_ARCH
endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
endif # LLAMA_CUBLAS
endif # LLAMA_CUDA
$(info )

ifdef LLAMA_CUBLAS
$(info !!!!)
$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.)
$(info !!!!)
$(info )
endif

#
# Build library
#
Expand Down
11 changes: 4 additions & 7 deletions README.md
Expand Up @@ -448,30 +448,27 @@ Building the program with BLAS support may lead to some performance improvements

Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.

- #### cuBLAS
- #### CUDA

This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).

For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling.

- Using `make`:
```bash
make LLAMA_CUBLAS=1
make LLAMA_CUDA=1
```
- Using `CMake`:

```bash
mkdir build
cd build
cmake .. -DLLAMA_CUBLAS=ON
cmake .. -DLLAMA_CUDA=ON
cmake --build . --config Release
```

The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:

<!---
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
--->
| Option | Legal values | Default | Description |
|--------------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
Expand Down
6 changes: 3 additions & 3 deletions ci/run.sh
Expand Up @@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
fi

if [ ! -z ${GG_BUILD_CUDA} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
fi

if [ ! -z ${GG_BUILD_SYCL} ]; then
Expand Down Expand Up @@ -412,8 +412,8 @@ function gg_run_open_llama_7b_v2 {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

python3 ../convert.py ${path_models}

Expand Down