LambdaLabsML · Apsu · Apr 14, 2025 · Mar 27, 2025 · Mar 28, 2025 · Mar 28, 2025
diff --git a/.github/workflows/build-vllm.yaml b/.github/workflows/build-vllm.yaml
@@ -4,26 +4,33 @@ env:
   PARALLELISM: 1
   TORCH_CUDA_ARCH_LIST: 9.0a
   VLLM_FA_CMAKE_GPU_ARCHES: 90a-real
+  TORCH_REF: v2.6.0
+  TORCH_BUILD_VERSION: 2.6.0+cu124
+  AUDIO_REF: v2.6.0
+  AUDIO_BUILD_VERSION: 2.6.0+cu124
+  VISION_REF: v0.21.0
+  VISION_BUILD_VERSION: 0.21.0+cu124
   TRITON_REF: release/3.2.x
-  TRITON_BUILD_SUFFIX: +cu126
+  TRITON_BUILD_SUFFIX: +cu124
   XFORMERS_REF: v0.0.29.post2
-  XFORMERS_BUILD_VERSION: 0.0.29.post2+cu126
+  XFORMERS_BUILD_VERSION: 0.0.29.post2+cu124
   FLASHINFER_REF: v0.2.2.post1
-  FLASHINFER_BUILD_SUFFIX: cu126
-  VLLM_REF: v0.8.1
+  FLASHINFER_BUILD_SUFFIX: cu124
+  VLLM_REF: v0.8.3
+  VLLM_BUILD_VERSION: 0.8.3
 
 on:
   push:
-    branches: [main]
+    branches: [cu124]
   pull_request:
 
 jobs:
   build:
     strategy:
       matrix:
         arch: [amd64, arm64]
-        cuda_version: [12.6.3]
-        image_distro: [ubuntu24.04]
+        cuda_version: [12.4.1]
+        image_distro: [ubuntu22.04]
     runs-on: [self-hosted, "${{ matrix.arch }}"]
     steps:
       - name: Prepare some env vars
@@ -65,13 +72,14 @@ jobs:
             FLASHINFER_REF=${{ env.FLASHINFER_REF }}
             FLASHINFER_BUILD_SUFFIX=${{ env.FLASHINFER_BUILD_SUFFIX }}
             VLLM_REF=${{ env.VLLM_REF }}
-          cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
-          cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }},mode=max
+            VLLM_BUILD_VERSION=${{ env.VLLM_BUILD_VERSION }}
+          cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }}
+          cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }},mode=max
           context: .
           file: Dockerfile
           platforms: linux/${{ matrix.arch }}
           push: true
-          tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_REF }}-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
+          tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_REF }}-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }}
 
   # Fix this to use matrix and handle imagetools create --append
   ghcr:
@@ -81,7 +89,7 @@ jobs:
       - name: Prepare some env vars
         run: |
           echo "GHCR_IMAGE=ghcr.io/${GITHUB_REPOSITORY@L}" >> ${GITHUB_ENV}
-          echo "CUDA_TAG=126" >> ${GITHUB_ENV}
+          echo "CUDA_TAG=124" >> ${GITHUB_ENV}
 
       - name: Login to GHCR
         uses: docker/login-action@v3
@@ -92,6 +100,6 @@ jobs:
 
       - name: Tag images
         run: |
-          TAGS=(${VLLM_REF}-cu${CUDA_TAG}-{amd,arm}64)
+          TAGS=(${VLLM_REF}-cu${CUDA_TAG}-ubuntu22.04-{amd,arm}64)
           docker buildx imagetools create -t ${GHCR_IMAGE}:${VLLM_REF} ${TAGS[@]/#/${GHCR_IMAGE}:}
           docker buildx imagetools create -t ${GHCR_IMAGE}:latest ${TAGS[@]/#/${GHCR_IMAGE}:}
diff --git a/Dockerfile b/Dockerfile
@@ -1,10 +1,16 @@
-ARG CUDA_VERSION=12.6.3
-ARG IMAGE_DISTRO=ubuntu24.04
+ARG CUDA_VERSION=12.4.1
+ARG IMAGE_DISTRO=ubuntu22.04
 ARG PYTHON_VERSION=3.12
 
 # ---------- Builder Base ----------
 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base
 
+# Job scaling
+ARG MAX_JOBS=32
+ENV MAX_JOBS=${MAX_JOBS}
+ARG NVCC_THREADS=2
+ENV NVCC_THREADS=${NVCC_THREADS}
+
 # Set arch lists for all targets
 # 'a' suffix is not forward compatible but enables all optimizations
 ARG TORCH_CUDA_ARCH_LIST="9.0a"
@@ -17,19 +23,17 @@ ENV DEBIAN_FRONTEND=noninteractive
 RUN apt update
 RUN apt upgrade -y
 RUN apt install -y --no-install-recommends \
-        curl \
-        git \
-        libibverbs-dev \
-        zlib1g-dev
-
-# Clean apt cache
-RUN apt clean
-RUN rm -rf /var/lib/apt/lists/*
-RUN rm -rf /var/cache/apt/archives
+    curl \
+    gcc-12 g++-12 \
+    git \
+    libibverbs-dev \
+    libjpeg-turbo8-dev \
+    libpng-dev \
+    zlib1g-dev
 
 # Set compiler paths
-ENV CC=/usr/bin/gcc
-ENV CXX=/usr/bin/g++
+ENV CC=/usr/bin/gcc-12
+ENV CXX=/usr/bin/g++-12
 
 # Install uv
 RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh
@@ -45,69 +49,146 @@ ENV PATH=${VIRTUAL_ENV}/bin:${PATH}
 ENV CUDA_HOME=/usr/local/cuda
 ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}
 
-# Install pytorch nightly
-RUN uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126
-
 FROM base AS build-base
 RUN mkdir /wheels
 
 # Install build deps that aren't in project requirements files
 # Make sure to upgrade setuptools to avoid triton build bug
-RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
+# Keep cmake below 4.x ('4.x' isn't parsed right by some tools yet); setuptools is capped at 76
+RUN uv pip install -U build "cmake<4" ninja pybind11 "setuptools<=76" wheel
+
+# Handle arm64 torch build
+FROM build-base AS build-torch
+ARG TARGETARCH
+RUN if [ "${TARGETARCH}" = arm64 ]; then \
+        apt install -y --no-install-recommends nvpl0; \
+    else \
+        uv pip install mkl-static mkl-include; \
+    fi
+ARG TORCH_REF=v2.6.0
+ARG TORCH_BUILD_VERSION=2.6.0+cu124
+ENV PYTORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}}
+ENV PYTORCH_BUILD_NUMBER=0
+RUN git clone https://github.com/pytorch/pytorch.git
+RUN cd pytorch && \
+    git checkout ${TORCH_REF} && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+    # # Bump XNNPACK submodule ref to fix compilation bug \
+    # cd third_party/XNNPACK && \
+    # git checkout fcc06d1
+# `export` only lasts for the RUN it is in, so the arm64 flags are set in the build step
+# NOTE(review): a torch linked against NVPL may need nvpl0 in the runtime stage - confirm
+RUN cd pytorch && \
+    if [ "${TARGETARCH}" = arm64 ]; then \
+        export BLAS=NVPL USE_PRIORITIZED_TEXT_FOR_LD=1 \
+            CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000; \
+    fi && \
+    uv pip install -r requirements.txt && \
+    uv build --wheel --no-build-isolation -o /wheels
+
+# torchaudio wheel, built with the torch wheel from build-torch installed
+FROM build-base AS build-audio
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
+# BUILD_VERSION prefers AUDIO_BUILD_VERSION, else AUDIO_REF without its leading 'v'
+ARG AUDIO_REF=v2.6.0
+ARG AUDIO_BUILD_VERSION=2.6.0+cu124
+ENV BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}}
+RUN git clone https://github.com/pytorch/audio.git
+RUN git -C audio checkout ${AUDIO_REF} && \
+    git -C audio submodule sync --recursive && \
+    git -C audio submodule update --init --recursive -j 8
+RUN cd audio && uv build --wheel --no-build-isolation -o /wheels
+
+# torchvision wheel, built with the torch wheel from build-torch installed
+FROM build-base AS build-vision
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+
+# BUILD_VERSION prefers VISION_BUILD_VERSION, else VISION_REF without its leading 'v'
+ARG VISION_REF=v0.21.0
+ARG VISION_BUILD_VERSION=0.21.0+cu124
+ENV BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}}
+RUN git clone https://github.com/pytorch/vision.git
+RUN git -C vision checkout ${VISION_REF} && \
+    git -C vision submodule sync --recursive && \
+    git -C vision submodule update --init --recursive -j 8
+RUN cd vision && uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-triton
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+# Git ref to build, and the suffix exported as TRITON_WHEEL_VERSION_SUFFIX
 ARG TRITON_REF=release/3.2.x
-ARG TRITON_BUILD_SUFFIX=+cu126
+ARG TRITON_BUILD_SUFFIX=+cu124
 ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-}
 RUN git clone https://github.com/triton-lang/triton.git
 RUN cd triton && \
     git checkout ${TRITON_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd triton && \
     uv build python --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-xformers
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+# BUILD_VERSION prefers XFORMERS_BUILD_VERSION, else XFORMERS_REF without its leading 'v'
 ARG XFORMERS_REF=v0.0.29.post2
-ARG XFORMERS_BUILD_VERSION=0.0.29.post2+cu126
+ARG XFORMERS_BUILD_VERSION=0.0.29.post2+cu124
 ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
-RUN git clone  https://github.com/facebookresearch/xformers.git
+RUN git clone https://github.com/facebookresearch/xformers.git
 RUN cd xformers && \
     git checkout ${XFORMERS_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd xformers && \
     uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-flashinfer
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+# Build args: FLASHINFER_ENABLE_AOT, the git ref, and the suffix exported as FLASHINFER_LOCAL_VERSION
 ARG FLASHINFER_ENABLE_AOT=1
 ARG FLASHINFER_REF=v0.2.2.post1
-ARG FLASHINFER_BUILD_SUFFIX=cu126
+ARG FLASHINFER_BUILD_SUFFIX=cu124
 ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
 RUN git clone https://github.com/flashinfer-ai/flashinfer.git
 RUN cd flashinfer && \
     git checkout ${FLASHINFER_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    git submodule sync --recursive && \
+    git submodule update --init --recursive -j 8
+RUN cd flashinfer && \
     uv build --wheel --no-build-isolation -o /wheels
 
 FROM build-base AS build-vllm
-ARG VLLM_REF=v0.8.1
+COPY --from=build-torch /wheels/*.whl wheels/
+RUN uv pip install wheels/*
+# Wheel version: VLLM_BUILD_VERSION, else VLLM_REF without its 'v'; fallback is empty, never ':'
+ARG VLLM_REF=v0.8.3
+ARG VLLM_BUILD_VERSION=0.8.3
+ENV BUILD_VERSION=${VLLM_BUILD_VERSION:-${VLLM_REF#v}}
+ENV SETUPTOOLS_SCM_PRETEND_VERSION=${BUILD_VERSION:-}
 RUN git clone https://github.com/vllm-project/vllm.git
 RUN cd vllm && \
     git checkout ${VLLM_REF} && \
-    git submodule sync && \
-    git submodule update --init --recursive -j 8 && \
+    python use_existing_torch.py && \
     uv pip install -r requirements/build.txt && \
     uv build --wheel --no-build-isolation -o /wheels
 
 FROM base AS vllm-openai
-COPY --from=build-flashinfer /wheels/* wheels/
-COPY --from=build-triton /wheels/* wheels/
-COPY --from=build-vllm /wheels/* wheels/
-COPY --from=build-xformers /wheels/* wheels/
+COPY --from=build-torch /wheels/*.whl wheels/
+COPY --from=build-audio /wheels/*.whl wheels/
+COPY --from=build-vision /wheels/*.whl wheels/
+COPY --from=build-flashinfer /wheels/*.whl wheels/
+COPY --from=build-triton /wheels/*.whl wheels/
+COPY --from=build-vllm /wheels/*.whl wheels/
+COPY --from=build-xformers /wheels/*.whl wheels/
 
 # Install and cleanup wheels
 RUN uv pip install wheels/*
-RUN rm -r wheels
 
 # Install pynvml
 RUN uv pip install pynvml
@@ -118,6 +199,12 @@ RUN uv pip install accelerate hf_transfer modelscope bitsandbytes timm boto3 run
 # Clean uv cache
 RUN uv clean
 
+# Clean apt cache
+RUN apt autoremove --purge -y
+RUN apt clean
+RUN rm -rf /var/lib/apt/lists/*
+RUN rm -rf /var/cache/apt/archives
+
 # Enable hf-transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER=1