diff --git a/.github/workflows/build-vllm.yaml b/.github/workflows/build-vllm.yaml
index 51a3ba4..9261a91 100644
--- a/.github/workflows/build-vllm.yaml
+++ b/.github/workflows/build-vllm.yaml
@@ -4,17 +4,24 @@ env:
   PARALLELISM: 1
   TORCH_CUDA_ARCH_LIST: 9.0a
   VLLM_FA_CMAKE_GPU_ARCHES: 90a-real
+  TORCH_REF: v2.6.0
+  TORCH_BUILD_VERSION: 2.6.0+cu124
+  AUDIO_REF: v2.6.0
+  AUDIO_BUILD_VERSION: 2.6.0+cu124
+  VISION_REF: v0.21.0
+  VISION_BUILD_VERSION: 0.21.0+cu124
   TRITON_REF: release/3.2.x
-  TRITON_BUILD_SUFFIX: +cu126
+  TRITON_BUILD_SUFFIX: +cu124
   XFORMERS_REF: v0.0.29.post2
-  XFORMERS_BUILD_VERSION: 0.0.29.post2+cu126
+  XFORMERS_BUILD_VERSION: 0.0.29.post2+cu124
   FLASHINFER_REF: v0.2.2.post1
-  FLASHINFER_BUILD_SUFFIX: cu126
-  VLLM_REF: v0.8.1
+  FLASHINFER_BUILD_SUFFIX: cu124
+  VLLM_REF: v0.8.3
+  VLLM_BUILD_VERSION: 0.8.3
 
 on:
   push:
-    branches: [main]
+    branches: [cu124]
   pull_request:
 
 jobs:
@@ -22,8 +29,8 @@
     strategy:
       matrix:
         arch: [amd64, arm64]
-        cuda_version: [12.6.3]
-        image_distro: [ubuntu24.04]
+        cuda_version: [12.4.1]
+        image_distro: [ubuntu22.04]
     runs-on: [self-hosted, "${{ matrix.arch }}"]
     steps:
       - name: Prepare some env vars
@@ -65,13 +72,14 @@
             FLASHINFER_REF=${{ env.FLASHINFER_REF }}
             FLASHINFER_BUILD_SUFFIX=${{ env.FLASHINFER_BUILD_SUFFIX }}
             VLLM_REF=${{ env.VLLM_REF }}
-          cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
-          cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }},mode=max
+            VLLM_BUILD_VERSION=${{ env.VLLM_BUILD_VERSION }}
+          cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }}
+          cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }},mode=max
           context: .
           file: Dockerfile
           platforms: linux/${{ matrix.arch }}
           push: true
-          tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_REF }}-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
+          tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_REF }}-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }}
 
   # Fix this to use matrix and handle imagetools create --append
   ghcr:
@@ -81,7 +89,7 @@
       - name: Prepare some env vars
        run: |
          echo "GHCR_IMAGE=ghcr.io/${GITHUB_REPOSITORY@L}" >> ${GITHUB_ENV}
-          echo "CUDA_TAG=126" >> ${GITHUB_ENV}
+          echo "CUDA_TAG=124" >> ${GITHUB_ENV}
 
       - name: Login to GHCR
        uses: docker/login-action@v3
@@ -92,6 +100,6 @@
 
       - name: Tag images
        run: |
-          TAGS=(${VLLM_REF}-cu${CUDA_TAG}-{amd,arm}64)
+          TAGS=(${VLLM_REF}-cu${CUDA_TAG}-ubuntu22.04-{amd,arm}64)
          docker buildx imagetools create -t ${GHCR_IMAGE}:${VLLM_REF} ${TAGS[@]/#/${GHCR_IMAGE}:}
          docker buildx imagetools create -t ${GHCR_IMAGE}:latest ${TAGS[@]/#/${GHCR_IMAGE}:}
diff --git a/Dockerfile b/Dockerfile
index f1ab30b..60ccaff 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,10 +1,16 @@
-ARG CUDA_VERSION=12.6.3
-ARG IMAGE_DISTRO=ubuntu24.04
+ARG CUDA_VERSION=12.4.1
+ARG IMAGE_DISTRO=ubuntu22.04
 ARG PYTHON_VERSION=3.12
 
 # ---------- Builder Base ----------
 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
 
+# Job scaling
+ARG MAX_JOBS=32
+ENV MAX_JOBS=${MAX_JOBS}
+ARG NVCC_THREADS=2
+ENV NVCC_THREADS=${NVCC_THREADS}
+
 # Set arch lists for all targets
 # 'a' suffix is not forward compatible but enables all optimizations
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
@@ -17,19 +23,17 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update
 RUN apt upgrade -y
 RUN apt install -y --no-install-recommends \
-    curl \
-    git \
-    libibverbs-dev \
-    zlib1g-dev
-
-# Clean apt cache
-RUN apt clean
-RUN rm -rf /var/lib/apt/lists/*
-RUN rm -rf /var/cache/apt/archives
+    curl \
+    gcc-12 g++-12 \
+    git \
+    libibverbs-dev \
+    libjpeg-turbo8-dev \
+    libpng-dev \
+    zlib1g-dev
 
 # Set compiler paths
-ENV CC=/usr/bin/gcc
-ENV CXX=/usr/bin/g++
+ENV CC=/usr/bin/gcc-12
+ENV CXX=/usr/bin/g++-12
 
 # Install uv
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh
@@ -45,69 +49,146 @@ ENV PATH=${VIRTUAL_ENV}/bin:${PATH}
 ENV CUDA_HOME=/usr/local/cuda
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 
-# Install pytorch nightly
-RUN uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126
-
 FROM base AS build-base
 RUN mkdir /wheels
 
 # Install build deps that aren't in project requirements files
 # Make sure to upgrade setuptools to avoid triton build bug
-RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
+# cmake '4.x' isn't parsed right by some tools yet
+RUN uv pip install -U build "cmake<4" ninja pybind11 "setuptools<=76" wheel
+
+# Handle arm64 torch build
+FROM build-base AS build-torch
+ARG TARGETARCH
+RUN if [ ${TARGETARCH} = arm64 ]; then \
+        # Install NVPL for ARM64 \
+        apt install -y --no-install-recommends nvpl0 && \
+        export BLAS=NVPL && \
+        # ARM64 linker optimization \
+        export CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 && \
+        export USE_PRIORITIZED_TEXT_FOR_LD=1; \
+    else \
+        uv pip install mkl-static mkl-include; \
+    fi
+
+ARG TORCH_REF=v2.6.0
+ARG TORCH_BUILD_VERSION=2.6.0+cu124
+ENV PYTORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}}
+ENV PYTORCH_BUILD_NUMBER=0
+RUN git clone https://github.com/pytorch/pytorch.git
+RUN cd pytorch && \
+    git checkout ${TORCH_REF} && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+    # # Bump XNNPACK submodule ref to fix compilation bug \
+    # cd third_party/XNNPACK && \
+    # git checkout fcc06d1
+RUN cd pytorch && \
+    uv pip install -r requirements.txt && \
+    uv build --wheel --no-build-isolation -o /wheels
+
+FROM build-base AS build-audio
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
+ARG AUDIO_REF=v2.6.0
+ARG AUDIO_BUILD_VERSION=2.6.0+cu124
+ENV BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}}
+RUN git clone https://github.com/pytorch/audio.git
+RUN cd audio && \
+    git checkout ${AUDIO_REF} && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd audio && \
+    uv build --wheel --no-build-isolation -o /wheels
+
+FROM build-base AS build-vision
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
+ARG VISION_REF=v0.21.0
+ARG VISION_BUILD_VERSION=0.21.0+cu124
+ENV BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}}
+RUN git clone https://github.com/pytorch/vision.git
+RUN cd vision && \
+    git checkout ${VISION_REF} && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd vision && \
+    uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-triton
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
 ARG TRITON_REF=release/3.2.x
-ARG TRITON_BUILD_SUFFIX=+cu126
+ARG TRITON_BUILD_SUFFIX=+cu124
 ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-}
 RUN git clone https://github.com/triton-lang/triton.git
 RUN cd triton && \
    git checkout ${TRITON_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd triton && \
    uv build python --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-xformers
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
 ARG XFORMERS_REF=v0.0.29.post2
-ARG XFORMERS_BUILD_VERSION=0.0.29.post2+cu126
+ARG XFORMERS_BUILD_VERSION=0.0.29.post2+cu124
 ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
-RUN git clone https://github.com/facebookresearch/xformers.git 
+RUN git clone https://github.com/facebookresearch/xformers.git
 RUN cd xformers && \
    git checkout ${XFORMERS_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd xformers && \
    uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
 ARG FLASHINFER_ENABLE_AOT=1
 ARG FLASHINFER_REF=v0.2.2.post1
-ARG FLASHINFER_BUILD_SUFFIX=cu126
+ARG FLASHINFER_BUILD_SUFFIX=cu124
 ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
 RUN git clone https://github.com/flashinfer-ai/flashinfer.git
 RUN cd flashinfer && \
    git checkout ${FLASHINFER_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd flashinfer && \
    uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-vllm
-ARG VLLM_REF=v0.8.1
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
+ARG VLLM_REF=v0.8.3
+ARG VLLM_BUILD_VERSION=0.8.3
+ENV BUILD_VERSION=${VLLM_BUILD_VERSION:-${VLLM_REF#v}}
+ENV SETUPTOOLS_SCM_PRETEND_VERSION=${BUILD_VERSION:-:}
 RUN git clone https://github.com/vllm-project/vllm.git
 RUN cd vllm && \
    git checkout ${VLLM_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    python use_existing_torch.py && \
    uv pip install -r requirements/build.txt && \
    uv build --wheel --no-build-isolation -o /wheels
 
 FROM base AS vllm-openai
-COPY --from=build-flashinfer /wheels/* wheels/
-COPY --from=build-triton /wheels/* wheels/
-COPY --from=build-vllm /wheels/* wheels/
-COPY --from=build-xformers /wheels/* wheels/
+COPY --from=build-torch /wheels/*.whl wheels/
+COPY --from=build-audio /wheels/*.whl wheels/
+COPY --from=build-vision /wheels/*.whl wheels/
+COPY --from=build-flashinfer /wheels/*.whl wheels/
+COPY --from=build-triton /wheels/*.whl wheels/
+COPY --from=build-vllm /wheels/*.whl wheels/
+COPY --from=build-xformers /wheels/*.whl wheels/
 
 # Install and cleanup wheels
 RUN uv pip install wheels/*
-RUN rm -r wheels
 
 # Install pynvml
 RUN uv pip install pynvml
@@ -118,6 +199,12 @@ RUN uv pip install accelerate hf_transfer modelscope bitsandbytes timm boto3 run
 
 # Clean uv cache
 RUN uv clean
 
+# Clean apt cache
+RUN apt autoremove --purge -y
+RUN apt clean
+RUN rm -rf /var/lib/apt/lists/*
+RUN rm -rf /var/cache/apt/archives
+
 # Enable hf-transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1