[1/n][CI] Load models in CI from S3 instead of HF #13205

Merged (23 commits) on Feb 19, 2025
203 changes: 203 additions & 0 deletions hf_to_s3.py
@@ -0,0 +1,203 @@
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import shutil

import boto3
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class ModelTransfer:

    def __init__(self,
                 model_id,
                 s3_bucket,
                 aws_access_key_id=None,
                 aws_secret_access_key=None,
                 aws_region=None):
        """
        Initialize the ModelTransfer class.

        Args:
            model_id (str): HuggingFace model ID
            s3_bucket (str): Name of the S3 bucket
            aws_access_key_id (str, optional)
            aws_secret_access_key (str, optional)
            aws_region (str, optional): AWS region. Defaults to None.
        """
        self.model_id = model_id
        self.s3_bucket = s3_bucket
        self.model_name = model_id.split('/')[-1]

        # Initialize S3 client
        self.s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=aws_region)

        # Initialize Hugging Face API
        self.hf_api = HfApi()

    def download_model(self, local_dir):
        """
        Download the model from HuggingFace.

        Args:
            local_dir (str): Local directory to save the model

        Returns:
            str: Path to the downloaded model directory
        """
        logger.info("Downloading model %s...", self.model_id)

        try:
            local_dir_with_model = os.path.join(local_dir, self.model_name)
            snapshot_download(repo_id=self.model_id,
                              local_dir=local_dir_with_model,
                              local_dir_use_symlinks=False,
                              token=os.getenv("HF_TOKEN"))
            logger.info("Model downloaded successfully to %s",
                        local_dir_with_model)
            return local_dir_with_model

        except Exception as e:
            logger.error("Error downloading model: %s", str(e))
            raise

    def upload_to_s3(self, local_dir):
        """
        Upload the model directory to S3.

        Args:
            local_dir (str): Local directory containing the model files
        """
        logger.info("Uploading model to S3 bucket %s...", self.s3_bucket)

        try:
            # Walk through all files in the directory
            for root, _, files in os.walk(local_dir):
                for filename in files:
                    # Get the full local path
                    local_path = os.path.join(root, filename)

                    # Calculate S3 path (preserve directory structure)
                    relative_path = os.path.relpath(local_path, local_dir)
                    s3_path = f"{self.model_name}/{relative_path}"

                    # Upload file with progress bar
                    file_size = os.path.getsize(local_path)
                    with tqdm(total=file_size,
                              unit='B',
                              unit_scale=True,
                              desc=f"Uploading {filename}") as pbar:
                        self.s3_client.upload_file(
                            local_path,
                            self.s3_bucket,
                            s3_path,
                            Callback=lambda bytes_transferred: pbar.update(
                                bytes_transferred))

                    logger.info("Uploaded %s to s3://%s/%s", filename,
                                self.s3_bucket, s3_path)

            logger.info("Model upload completed successfully!")

        except Exception as e:
            logger.error("Error uploading to S3: %s", str(e))
            raise


# "ibm/PowerMoE-3b", "internlm/internlm-chat-7b",
# "internlm/internlm2-chat-7b", "OpenGVLab/Mono-InternVL-2B",
# "internlm/internlm3-8b-instruct", "inceptionai/jais-13b-chat",
# "ai21labs/AI21-Jamba-1.5-Mini", "meta-llama/Meta-Llama-3-8B",
# "decapoda-research/llama-7b-hf", "state-spaces/mamba-130m-hf",
# "tiiuae/falcon-mamba-7b-instruct", "openbmb/MiniCPM-2B-sft-bf16",
# "openbmb/MiniCPM3-4B", "mistralai/Mistral-7B-Instruct-v0.1",
# "mistralai/Mixtral-8x7B-Instruct-v0.1",
# "mistral-community/Mixtral-8x22B-v0.1-AWQ", "mpt", "mosaicml/mpt-7b",
# "nvidia/Minitron-8B-Base", "allenai/OLMo-1B-hf",
# "shanearora/OLMo-7B-1124-hf", "allenai/OLMoE-1B-7B-0924-Instruct",
# "facebook/opt-iml-max-1.3b", "OrionStarAI/Orion-14B-Chat",
# "adept/persimmon-8b-chat", "microsoft/phi-2",
# "microsoft/Phi-3-mini-4k-instruct",
# "microsoft/Phi-3-small-8k-instruct", "microsoft/Phi-3.5-MoE-instruct",
# "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat",
# "tiiuae/falcon-40b", "stabilityai/stablelm-zephyr-3b",
# "stabilityai/stablelm-3b-4e1t", "bigcode/starcoder2-3b",
# "upstage/solar-pro-preview-instruct", "Tele-AI/TeleChat2-3B",
# "xverse/XVERSE-7B-Chat", "facebook/bart-base",
# "facebook/bart-large-cnn", "microsoft/Florence-2-base",
# "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2",
# "parasail-ai/GritLM-7B-vllm", "internlm/internlm2-1_8b-reward",
# "ai21labs/Jamba-tiny-reward-dev", "llama",
# "intfloat/e5-mistral-7b-instruct",
# "ssmits/Qwen2-7B-Instruct-embed-base", "Qwen/Qwen2.5-Math-RM-72B",
# "Qwen/Qwen2.5-Math-PRM-7B", "jason9693/Qwen2.5-1.5B-apeach",
# "sentence-transformers/stsb-roberta-base-v2",
# "sentence-transformers/all-roberta-large-v1",
# "intfloat/multilingual-e5-large", "royokong/e5-v",
# "TIGER-Lab/VLM2Vec-Full", "MrLight/dse-qwen2-2b-mrl-v1",
# "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
# "cross-encoder/ms-marco-MiniLM-L-6-v2",
# "cross-encoder/quora-roberta-base", "BAAI/bge-reranker-v2-m3",
# "THUDM/glm-4v-9b", "chatglm2-6b", "deepseek-ai/deepseek-vl2-tiny",
# "adept/fuyu-8b", "h2oai/h2ovl-mississippi-800m",
# "OpenGVLab/InternVL2-1B", "HuggingFaceM4/Idefics3-8B-Llama3",
# "llava-hf/llava-1.5-7b-hf", "llava-hf/llava-v1.6-mistral-7b-hf",
# "llava-hf/LLaVA-NeXT-Video-7B-hf",
# "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
# "TIGER-Lab/Mantis-8B-siglip-llama3", "openbmb/MiniCPM-o-2_6",
# "openbmb/MiniCPM-V-2_6", "allenai/Molmo-7B-D-0924",
# "nvidia/NVLM-D-72B", "google/paligemma-3b-pt-224",
# "microsoft/Phi-3-vision-128k-instruct", "mistralai/Pixtral-12B-2409",
# "Qwen/Qwen-VL-Chat", "Qwen/Qwen2-Audio-7B-Instruct",
# "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct",
# "fixie-ai/ultravox-v0_5-llama-3_2-1b",
# "meta-llama/Llama-3.2-11B-Vision-Instruct", "openai/whisper-large-v3",
# "JackFram/llama-68m", "JackFram/llama-68m", "JackFram/llama-160m",
# "ArthurZ/Ilama-3.2-1B"


def main():
    # Configuration
    MODEL_ID = ["mistralai/Mistral-7B-Instruct-v0.1"]
    S3_BUCKET = "vllm-ci-model-weights"
    # Local directory to temporarily store the model
    LOCAL_DIR = "/home/ec2-user/models"

    AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
    AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
    AWS_REGION = "us-west-2"

    # Create transfer object
    for model_id in MODEL_ID:
        transfer = ModelTransfer(model_id=model_id,
                                 s3_bucket=S3_BUCKET,
                                 aws_access_key_id=AWS_ACCESS_KEY_ID,
                                 aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
                                 aws_region=AWS_REGION)

        try:
            # Create local directory if it doesn't exist
            os.makedirs(LOCAL_DIR, exist_ok=True)

            # Download model
            model_dir = transfer.download_model(LOCAL_DIR)

            # Upload to S3 and cleanup
            transfer.upload_to_s3(model_dir)
            shutil.rmtree(model_dir)

        except Exception as e:
            logger.error("Error in transfer process: %s", str(e))
            raise


if __name__ == "__main__":
    main()
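For reference, a minimal sketch of driving ModelTransfer for a single model outside of main(). It assumes AWS credentials and HF_TOKEN are exported in the environment; the model, bucket name, and local path shown here are illustrative only:

# Sketch only: mirrors what main() does for one entry of MODEL_ID.
transfer = ModelTransfer(model_id="distilbert/distilgpt2",
                         s3_bucket="vllm-ci-model-weights",
                         aws_region="us-west-2")
local_copy = transfer.download_model("/tmp/models")
transfer.upload_to_s3(local_copy)
shutil.rmtree(local_copy)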
2 changes: 2 additions & 0 deletions requirements-test.in
@@ -37,3 +37,5 @@ genai_perf==0.0.8
tritonclient==2.51.0

numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
8 changes: 8 additions & 0 deletions requirements-test.txt
@@ -171,6 +171,8 @@ huggingface-hub==0.26.2
# tokenizers
# transformers
# vocos
humanize==4.11.0
# via runai-model-streamer
idna==3.10
# via
# anyio
@@ -290,6 +292,7 @@ numpy==1.26.4
# patsy
# peft
# rouge-score
# runai-model-streamer
# sacrebleu
# scikit-learn
# scipy
@@ -514,6 +517,10 @@ rpds-py==0.20.1
# referencing
rsa==4.7.2
# via awscli
runai-model-streamer==0.11.0
# via -r requirements-test.in
runai-model-streamer-s3==0.11.0
# via -r requirements-test.in
s3transfer==0.10.3
# via
# awscli
@@ -594,6 +601,7 @@ torch==2.5.1
# encodec
# lm-eval
# peft
# runai-model-streamer
# sentence-transformers
# tensorizer
# timm
18 changes: 9 additions & 9 deletions tests/basic_correctness/test_basic_correctness.py
@@ -33,7 +33,7 @@ def v1(run_with_both_engines):

def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
    llm = LLM("facebook/opt-125m")
    llm = LLM("distilbert/distilgpt2", load_format="runai_streamer")
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
@@ -94,14 +94,14 @@ def test_models(
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
        ("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
        ("distilbert/distilgpt2", "ray", "", "L4"),
        ("distilbert/distilgpt2", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("distilbert/distilgpt2", "ray", "", "A100"),
        ("distilbert/distilgpt2", "mp", "", "A100"),
        ("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
def test_models_distributed(
    hf_runner,
10 changes: 7 additions & 3 deletions tests/basic_correctness/test_cumem.py
@@ -7,6 +7,7 @@
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes

from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test


@@ -118,13 +119,16 @@ def model(x):
@pytest.mark.parametrize(
    "model",
    [
        "meta-llama/Llama-3.2-1B-Instruct",  # sleep mode with safetensors
        "facebook/opt-125m"  # sleep mode with pytorch checkpoint
        # sleep mode with safetensors
        f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
        # sleep mode with pytorch checkpoint
        "facebook/opt-125m"
    ])
def test_end_to_end(model):
    free, total = torch.cuda.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running
    llm = LLM(model, enable_sleep_mode=True)
    load_format = "runai_streamer" if "Llama" in model else "auto"
    llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    output = llm.generate(prompt, sampling_params)
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_preemption.py
@@ -17,7 +17,7 @@
from ..models.utils import check_outputs_equal

MODELS = [
"facebook/opt-125m",
"distilbert/distilgpt2",
]


23 changes: 22 additions & 1 deletion tests/conftest.py
@@ -24,7 +24,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
@@ -46,6 +46,21 @@
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

_M = TypeVar("_M")

MODELS_ON_S3 = [
    "distilbert/distilgpt2",
    "meta-llama/Llama-2-7b-hf",
    "meta-llama/Meta-Llama-3-8B",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "openai-community/gpt2",
    "ArthurZ/Ilama-3.2-1B",
    "llava-hf/llava-1.5-7b-hf",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -679,6 +694,11 @@ def __init__(
        enforce_eager: Optional[bool] = False,
        **kwargs,
    ) -> None:
        load_format = LoadFormat.AUTO
        if model_name in MODELS_ON_S3:
            model_name = (f"s3://vllm-ci-model-weights/"
                          f"{model_name.split('/')[-1]}")
            load_format = LoadFormat.RUNAI_STREAMER
        self.model = LLM(
            model=model_name,
            task=task,
@@ -693,6 +713,7 @@ def __init__(
            max_model_len=max_model_len,
            block_size=block_size,
            enable_chunked_prefill=enable_chunked_prefill,
            load_format=load_format,
            **kwargs,
        )

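Note: the effect of the conftest change above is that any model listed in MODELS_ON_S3 is redirected to the CI bucket and streamed with the Run:ai model streamer instead of being pulled from the Hugging Face Hub. A minimal sketch of what the rewritten call amounts to (the prompt here is illustrative only):

from vllm import LLM, SamplingParams

# "distilbert/distilgpt2" is on MODELS_ON_S3, so VllmRunner effectively builds:
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
          load_format="runai_streamer")
outputs = llm.generate("Hello", SamplingParams(max_tokens=8))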
5 changes: 4 additions & 1 deletion tests/engine/test_computed_prefix_blocks.py
@@ -6,8 +6,10 @@
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
    # This test checks if we are able to run the engine to completion
@@ -24,6 +26,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.")

    engine_args = EngineArgs(model=model,
                             load_format="runai_streamer",
                             block_size=block_size,
                             enable_prefix_caching=True)

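The rest of this test is truncated by the diff view. As a rough sketch only (not the test's exact body), engine args like these are typically turned into an engine and stepped to completion along these lines, assuming a `prompt` string was built earlier in the test:

# Sketch under the assumptions above.
engine = LLMEngine.from_engine_args(engine_args)
engine.add_request("0", prompt, SamplingParams(max_tokens=10))
while engine.has_unfinished_requests():
    engine.step()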