Skip to content

Merge newer build components into main #2

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Apr 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 20 additions & 12 deletions .github/workflows/build-vllm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,33 @@ env:
PARALLELISM: 1
TORCH_CUDA_ARCH_LIST: 9.0a
VLLM_FA_CMAKE_GPU_ARCHES: 90a-real
TORCH_REF: v2.6.0
TORCH_BUILD_VERSION: 2.6.0+cu124
AUDIO_REF: v2.6.0
AUDIO_BUILD_VERSION: 2.6.0+cu124
VISION_REF: v0.21.0
VISION_BUILD_VERSION: 0.21.0+cu124
TRITON_REF: release/3.2.x
TRITON_BUILD_SUFFIX: +cu126
TRITON_BUILD_SUFFIX: +cu124
XFORMERS_REF: v0.0.29.post2
XFORMERS_BUILD_VERSION: 0.0.29.post2+cu126
XFORMERS_BUILD_VERSION: 0.0.29.post2+cu124
FLASHINFER_REF: v0.2.2.post1
FLASHINFER_BUILD_SUFFIX: cu126
VLLM_REF: v0.8.1
FLASHINFER_BUILD_SUFFIX: cu124
VLLM_REF: v0.8.3
VLLM_BUILD_VERSION: 0.8.3

on:
push:
branches: [main]
branches: [cu124]
pull_request:

jobs:
build:
strategy:
matrix:
arch: [amd64, arm64]
cuda_version: [12.6.3]
image_distro: [ubuntu24.04]
cuda_version: [12.4.1]
image_distro: [ubuntu22.04]
runs-on: [self-hosted, "${{ matrix.arch }}"]
steps:
- name: Prepare some env vars
Expand Down Expand Up @@ -65,13 +72,14 @@ jobs:
FLASHINFER_REF=${{ env.FLASHINFER_REF }}
FLASHINFER_BUILD_SUFFIX=${{ env.FLASHINFER_BUILD_SUFFIX }}
VLLM_REF=${{ env.VLLM_REF }}
cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.arch }},mode=max
VLLM_BUILD_VERSION=${{ env.VLLM_BUILD_VERSION }}
cache-from: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }}
cache-to: type=registry,ref=${{ env.GHCR_IMAGE }}:cache-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }},mode=max
context: .
file: Dockerfile
platforms: linux/${{ matrix.arch }}
push: true
tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_REF }}-cu${{ env.CUDA_TAG }}-${{ matrix.arch }}
tags: ${{ env.GHCR_IMAGE }}:${{ env.VLLM_REF }}-cu${{ env.CUDA_TAG }}-${{ matrix.image_distro }}-${{ matrix.arch }}

# TODO: derive the per-arch tags from the build matrix (distro and arch are hard-coded in this job) and use `docker buildx imagetools create --append`
ghcr:
Expand All @@ -81,7 +89,7 @@ jobs:
- name: Prepare some env vars
run: |
echo "GHCR_IMAGE=ghcr.io/${GITHUB_REPOSITORY@L}" >> ${GITHUB_ENV}
echo "CUDA_TAG=126" >> ${GITHUB_ENV}
echo "CUDA_TAG=124" >> ${GITHUB_ENV}

- name: Login to GHCR
uses: docker/login-action@v3
Expand All @@ -92,6 +100,6 @@ jobs:

- name: Tag images
run: |
TAGS=(${VLLM_REF}-cu${CUDA_TAG}-{amd,arm}64)
TAGS=(${VLLM_REF}-cu${CUDA_TAG}-ubuntu22.04-{amd,arm}64)
docker buildx imagetools create -t ${GHCR_IMAGE}:${VLLM_REF} ${TAGS[@]/#/${GHCR_IMAGE}:}
docker buildx imagetools create -t ${GHCR_IMAGE}:latest ${TAGS[@]/#/${GHCR_IMAGE}:}
157 changes: 122 additions & 35 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
ARG CUDA_VERSION=12.6.3
ARG IMAGE_DISTRO=ubuntu24.04
ARG CUDA_VERSION=12.4.1
ARG IMAGE_DISTRO=ubuntu22.04
ARG PYTHON_VERSION=3.12

# ---------- Builder Base ----------
FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${IMAGE_DISTRO} AS base

# Job scaling
ARG MAX_JOBS=32
ENV MAX_JOBS=${MAX_JOBS}
ARG NVCC_THREADS=2
ENV NVCC_THREADS=${NVCC_THREADS}

# Set arch lists for all targets
# 'a' suffix is not forward compatible but enables all optimizations
ARG TORCH_CUDA_ARCH_LIST="9.0a"
Expand All @@ -17,19 +23,17 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt upgrade -y
RUN apt install -y --no-install-recommends \
curl \
git \
libibverbs-dev \
zlib1g-dev

# Clean apt cache
RUN apt clean
RUN rm -rf /var/lib/apt/lists/*
RUN rm -rf /var/cache/apt/archives
curl \
gcc-12 g++-12 \
git \
libibverbs-dev \
libjpeg-turbo8-dev \
libpng-dev \
zlib1g-dev

# Set compiler paths
ENV CC=/usr/bin/gcc
ENV CXX=/usr/bin/g++
ENV CC=/usr/bin/gcc-12
ENV CXX=/usr/bin/g++-12

# Install uv
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR=/usr/local/bin sh
Expand All @@ -45,69 +49,146 @@ ENV PATH=${VIRTUAL_ENV}/bin:${PATH}
ENV CUDA_HOME=/usr/local/cuda
ENV LD_LIBRARY_PATH=${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}

# Install pytorch nightly
RUN uv pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu126

# ---------- Shared build stage ----------
# Parent stage for the build-* stages below. Creates /wheels, the directory
# the wheel builds write into via `-o /wheels`.
FROM base AS build-base
RUN mkdir /wheels

# Install build deps that aren't in project requirements files
# Make sure to upgrade setuptools to avoid triton build bug
# NOTE(review): the unpinned install below and the pinned one after it look
# like the before/after sides of a diff - confirm only the pinned line is kept.
RUN uv pip install -U build cmake ninja pybind11 setuptools wheel
# cmake '4.x' isn't parsed right by some tools yet
RUN uv pip install -U build "cmake<4" ninja pybind11 "setuptools<=76" wheel

# ---------- Torch wheel ----------
# Builds the PyTorch wheel from source at TORCH_REF and writes it to /wheels.
FROM build-base AS build-torch
ARG TARGETARCH

# Per-arch BLAS dependency: NVPL (apt) on arm64, static MKL (pip) otherwise.
# Only the package installs live here; they persist in the image layer.
RUN if [ "${TARGETARCH}" = arm64 ]; then \
        apt install -y --no-install-recommends nvpl0; \
    else \
        uv pip install mkl-static mkl-include; \
    fi

# TORCH_BUILD_VERSION overrides the wheel version; when empty, the version
# falls back to TORCH_REF with its leading "v" stripped.
ARG TORCH_REF=v2.6.0
ARG TORCH_BUILD_VERSION=2.6.0+cu124
ENV PYTORCH_BUILD_VERSION=${TORCH_BUILD_VERSION:-${TORCH_REF#v}}
ENV PYTORCH_BUILD_NUMBER=0
RUN git clone https://github.com/pytorch/pytorch.git
RUN cd pytorch && \
    git checkout ${TORCH_REF} && \
    git submodule sync --recursive && \
    git submodule update --init --recursive -j 8
    # # Bump XNNPACK submodule ref to fix compilation bug \
    # cd third_party/XNNPACK && \
    # git checkout fcc06d1

# The arm64 build settings are exported in the same RUN as the build itself:
# each RUN starts a fresh shell, so variables exported in an earlier RUN
# never reach this step.
RUN cd pytorch && \
    if [ "${TARGETARCH}" = arm64 ]; then \
        export BLAS=NVPL && \
        export CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 && \
        export USE_PRIORITIZED_TEXT_FOR_LD=1; \
    fi && \
    uv pip install -r requirements.txt && \
    uv build --wheel --no-build-isolation -o /wheels

# ---------- torchaudio wheel ----------
# Builds the torchaudio wheel at AUDIO_REF against the torch wheel produced
# by the build-torch stage, and writes it to /wheels.
FROM build-base AS build-audio

# Install the locally built torch wheel before building against it.
COPY --from=build-torch /wheels/*.whl wheels/
RUN uv pip install wheels/*

# AUDIO_BUILD_VERSION overrides the version string; when empty, it falls back
# to AUDIO_REF with its leading "v" stripped.
ARG AUDIO_REF=v2.6.0
ARG AUDIO_BUILD_VERSION=2.6.0+cu124
ENV BUILD_VERSION=${AUDIO_BUILD_VERSION:-${AUDIO_REF#v}}

# Fetch the sources and pin them (and all submodules) to the requested ref.
RUN git clone https://github.com/pytorch/audio.git
RUN git -C audio checkout ${AUDIO_REF} && \
    git -C audio submodule sync --recursive && \
    git -C audio submodule update --init --recursive -j 8

# Build against the already-installed environment (no isolated build env).
RUN cd audio && \
    uv build --wheel --no-build-isolation --out-dir /wheels

# ---------- torchvision wheel ----------
# Builds the torchvision wheel at VISION_REF against the torch wheel produced
# by the build-torch stage, and writes it to /wheels.
FROM build-base AS build-vision

# Install the locally built torch wheel before building against it.
COPY --from=build-torch /wheels/*.whl wheels/
RUN uv pip install wheels/*

# VISION_BUILD_VERSION overrides the version string; when empty, it falls back
# to VISION_REF with its leading "v" stripped.
ARG VISION_REF=v0.21.0
ARG VISION_BUILD_VERSION=0.21.0+cu124
ENV BUILD_VERSION=${VISION_BUILD_VERSION:-${VISION_REF#v}}

# Fetch the sources and pin them (and all submodules) to the requested ref.
RUN git clone https://github.com/pytorch/vision.git
RUN git -C vision checkout ${VISION_REF} && \
    git -C vision submodule sync --recursive && \
    git -C vision submodule update --init --recursive -j 8

# Build against the already-installed environment (no isolated build env).
RUN cd vision && \
    uv build --wheel --no-build-isolation --out-dir /wheels

# ---------- Triton wheel ----------
# Installs the torch wheel from build-torch, checks out triton at TRITON_REF
# and builds its wheel (from the `python` subdirectory) into /wheels.
FROM build-base AS build-triton
COPY --from=build-torch /wheels/*.whl wheels/
RUN uv pip install wheels/*

ARG TRITON_REF=release/3.2.x
# NOTE(review): the two TRITON_BUILD_SUFFIX defaults below look like the
# before/after sides of a diff; as written the later (+cu124) one wins.
ARG TRITON_BUILD_SUFFIX=+cu126
ARG TRITON_BUILD_SUFFIX=+cu124
# Empty suffix is allowed: the `:-` fallback expands to an empty string.
ENV TRITON_WHEEL_VERSION_SUFFIX=${TRITON_BUILD_SUFFIX:-}
RUN git clone https://github.com/triton-lang/triton.git
# NOTE(review): submodule sync/update appears twice in this RUN (with and
# without --recursive on sync) - likely old and new diff lines; confirm.
RUN cd triton && \
git checkout ${TRITON_REF} && \
git submodule sync && \
git submodule update --init --recursive -j 8 && \
git submodule sync --recursive && \
git submodule update --init --recursive -j 8
RUN cd triton && \
uv build python --wheel --no-build-isolation -o /wheels

# ---------- xformers wheel ----------
# Installs the torch wheel from build-torch, checks out xformers at
# XFORMERS_REF and builds its wheel into /wheels.
FROM build-base AS build-xformers
COPY --from=build-torch /wheels/*.whl wheels/
RUN uv pip install wheels/*

ARG XFORMERS_REF=v0.0.29.post2
# NOTE(review): the two XFORMERS_BUILD_VERSION defaults below look like the
# before/after sides of a diff; as written the later (+cu124) one wins.
ARG XFORMERS_BUILD_VERSION=0.0.29.post2+cu126
ARG XFORMERS_BUILD_VERSION=0.0.29.post2+cu124
# Falls back to XFORMERS_REF with its leading "v" stripped when the build
# version is empty.
ENV BUILD_VERSION=${XFORMERS_BUILD_VERSION:-${XFORMERS_REF#v}}
# NOTE(review): the clone is listed twice - presumably old and new diff
# lines; a second clone into the same directory would fail. Confirm.
RUN git clone https://github.com/facebookresearch/xformers.git
RUN git clone https://github.com/facebookresearch/xformers.git
RUN cd xformers && \
git checkout ${XFORMERS_REF} && \
git submodule sync && \
git submodule update --init --recursive -j 8 && \
git submodule sync --recursive && \
git submodule update --init --recursive -j 8
RUN cd xformers && \
uv build --wheel --no-build-isolation -o /wheels

# ---------- FlashInfer wheel ----------
# Installs the torch wheel from build-torch, checks out flashinfer at
# FLASHINFER_REF and builds its wheel into /wheels.
FROM build-base AS build-flashinfer
COPY --from=build-torch /wheels/*.whl wheels/
RUN uv pip install wheels/*

# NOTE(review): FLASHINFER_ENABLE_AOT is declared as a build arg only; no ENV
# in this stage exports it - confirm the build actually sees it.
ARG FLASHINFER_ENABLE_AOT=1
ARG FLASHINFER_REF=v0.2.2.post1
# NOTE(review): the two FLASHINFER_BUILD_SUFFIX defaults below look like the
# before/after sides of a diff; as written the later (cu124) one wins.
ARG FLASHINFER_BUILD_SUFFIX=cu126
ARG FLASHINFER_BUILD_SUFFIX=cu124
# Empty suffix is allowed: the `:-` fallback expands to an empty string.
ENV FLASHINFER_LOCAL_VERSION=${FLASHINFER_BUILD_SUFFIX:-}
RUN git clone https://github.com/flashinfer-ai/flashinfer.git
RUN cd flashinfer && \
git checkout ${FLASHINFER_REF} && \
git submodule sync && \
git submodule update --init --recursive -j 8 && \
git submodule sync --recursive && \
git submodule update --init --recursive -j 8
RUN cd flashinfer && \
uv build --wheel --no-build-isolation -o /wheels

# ---------- vLLM wheel ----------
# Installs the torch wheel from build-torch, checks out vllm at VLLM_REF,
# runs use_existing_torch.py, installs requirements/build.txt and builds the
# wheel into /wheels.
FROM build-base AS build-vllm
# NOTE(review): VLLM_REF is declared twice (v0.8.1 here, v0.8.3 below) -
# presumably old and new diff lines; as written the later one wins.
ARG VLLM_REF=v0.8.1
COPY --from=build-torch /wheels/*.whl wheels/
RUN uv pip install wheels/*

ARG VLLM_REF=v0.8.3
ARG VLLM_BUILD_VERSION=0.8.3
# Falls back to VLLM_REF with its leading "v" stripped when the build version
# is empty.
ENV BUILD_VERSION=${VLLM_BUILD_VERSION:-${VLLM_REF#v}}
# NOTE(review): the fallback here is a literal ":" (`:-:`), which would become
# the pretend version if BUILD_VERSION were empty - confirm this is intended
# rather than an empty default (`:-`).
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${BUILD_VERSION:-:}
RUN git clone https://github.com/vllm-project/vllm.git
RUN cd vllm && \
git checkout ${VLLM_REF} && \
git submodule sync && \
git submodule update --init --recursive -j 8 && \
python use_existing_torch.py && \
uv pip install -r requirements/build.txt && \
uv build --wheel --no-build-isolation -o /wheels

FROM base AS vllm-openai
COPY --from=build-flashinfer /wheels/* wheels/
COPY --from=build-triton /wheels/* wheels/
COPY --from=build-vllm /wheels/* wheels/
COPY --from=build-xformers /wheels/* wheels/
COPY --from=build-torch /wheels/*.whl wheels/
COPY --from=build-audio /wheels/*.whl wheels/
COPY --from=build-vision /wheels/*.whl wheels/
COPY --from=build-flashinfer /wheels/*.whl wheels/
COPY --from=build-triton /wheels/*.whl wheels/
COPY --from=build-vllm /wheels/*.whl wheels/
COPY --from=build-xformers /wheels/*.whl wheels/

# Install and cleanup wheels
RUN uv pip install wheels/*
RUN rm -r wheels

# Install pynvml
RUN uv pip install pynvml
Expand All @@ -118,6 +199,12 @@ RUN uv pip install accelerate hf_transfer modelscope bitsandbytes timm boto3 run
# Clean uv cache
RUN uv clean

# Clean apt cache
RUN apt autoremove --purge -y
RUN apt clean
RUN rm -rf /var/lib/apt/lists/*
RUN rm -rf /var/cache/apt/archives

# Enable hf-transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1

Expand Down