[1/n][CI] Load models in CI from S3 instead of HF #13205

Merged
merged 23 commits on Feb 19, 2025
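This PR points the CI test suite at an S3 bucket (s3://vllm-ci-model-weights) and loads weights with the Run:ai Model Streamer instead of downloading them from the Hugging Face Hub; several tests also switch to smaller models (e.g. facebook/opt-125m to distilbert/distilgpt2). A minimal sketch of the loading pattern the updated tests rely on (not code from this PR; it assumes the environment has read access to the CI bucket):

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat

# distilgpt2 is one of the models mirrored in the CI bucket (see MODELS_ON_S3
# in tests/conftest.py). RUNAI_STREAMER streams the safetensors weights
# straight from S3 rather than fetching them from the Hugging Face Hub.
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
          load_format=LoadFormat.RUNAI_STREAMER)
outputs = llm.generate("How are you?",
                       SamplingParams(temperature=0, max_tokens=10))
print(outputs[0].outputs[0].text)
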
2 changes: 2 additions & 0 deletions requirements-test.in
@@ -37,3 +37,5 @@ genai_perf==0.0.8
tritonclient==2.51.0

numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
8 changes: 8 additions & 0 deletions requirements-test.txt
@@ -171,6 +171,8 @@ huggingface-hub==0.26.2
# tokenizers
# transformers
# vocos
humanize==4.11.0
# via runai-model-streamer
idna==3.10
# via
# anyio
@@ -290,6 +292,7 @@ numpy==1.26.4
# patsy
# peft
# rouge-score
# runai-model-streamer
# sacrebleu
# scikit-learn
# scipy
@@ -514,6 +517,10 @@ rpds-py==0.20.1
# referencing
rsa==4.7.2
# via awscli
runai-model-streamer==0.11.0
# via -r requirements-test.in
runai-model-streamer-s3==0.11.0
# via -r requirements-test.in
s3transfer==0.10.3
# via
# awscli
@@ -594,6 +601,7 @@ torch==2.5.1
# encodec
# lm-eval
# peft
# runai-model-streamer
# sentence-transformers
# tensorizer
# timm
19 changes: 10 additions & 9 deletions tests/basic_correctness/test_basic_correctness.py
@@ -9,6 +9,7 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform

from ..conftest import VllmRunner
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):

def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
@@ -94,14 +95,14 @@ def test_models(
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
13 changes: 10 additions & 3 deletions tests/basic_correctness/test_cumem.py
@@ -4,9 +4,11 @@
import torch

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes

from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test


@@ -118,13 +120,18 @@ def model(x):
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
# sleep mode with safetensors
f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
"facebook/opt-125m"
])
def test_end_to_end(model):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
load_format = LoadFormat.AUTO
if "Llama" in model:
load_format = LoadFormat.RUNAI_STREAMER
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_preemption.py
@@ -17,7 +17,7 @@
from ..models.utils import check_outputs_equal

MODELS = [
"facebook/opt-125m",
"distilbert/distilgpt2",
]


25 changes: 24 additions & 1 deletion tests/conftest.py
@@ -24,7 +24,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
@@ -46,6 +46,21 @@
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

_M = TypeVar("_M")

MODELS_ON_S3 = [
"distilbert/distilgpt2",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -677,8 +692,15 @@ def __init__(
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs,
) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"s3://vllm-ci-model-weights/"
f"{model_name.split('/')[-1]}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM(
model=model_name,
task=task,
@@ -693,6 +715,7 @@ def __init__(
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs,
)

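With the conftest change above, existing tests that pass a Hugging Face model ID to VllmRunner are redirected transparently: if the model is listed in MODELS_ON_S3 and no explicit load_format is given, the name is rewritten to the flat CI bucket and the Run:ai streamer is selected. A rough standalone illustration of that rewrite (not additional PR code; load formats shown as plain strings):

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B-Instruct"]

def resolve(model_name: str) -> tuple:
    # Mirrors the logic added to VllmRunner.__init__ above: keep only the
    # repository name, since the CI bucket is flat (no org prefixes).
    if model_name in MODELS_ON_S3:
        return (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name.split('/')[-1]}",
                "runai_streamer")
    return model_name, "auto"

assert resolve("distilbert/distilgpt2") == (
    "s3://vllm-ci-model-weights/distilgpt2", "runai_streamer")

Since only the repo name is kept, two listed models sharing a repo name would collide in the bucket; that appears acceptable for the fixed CI list.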
6 changes: 5 additions & 1 deletion tests/engine/test_computed_prefix_blocks.py
@@ -2,12 +2,15 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
@@ -24,6 +27,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.")

engine_args = EngineArgs(model=model,
load_format=LoadFormat.RUNAI_STREAMER,
block_size=block_size,
enable_prefix_caching=True)

7 changes: 5 additions & 2 deletions tests/engine/test_detokenization.py
@@ -2,11 +2,14 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
@@ -17,7 +20,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")

llm = LLM(model=model)
llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
17 changes: 13 additions & 4 deletions tests/engine/test_executor.py
@@ -6,12 +6,17 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


class Mock:
...
@@ -33,10 +38,11 @@ def collective_rpc(self,
CustomUniExecutorAsync = CustomUniExecutor


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor_type_checking(model):
with pytest.raises(ValueError):
engine_args = EngineArgs(model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError):
@@ -45,7 +51,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
@@ -54,6 +60,7 @@ def test_custom_executor(model, tmp_path):

engine_args = EngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
)
@@ -68,7 +75,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
@@ -77,6 +84,7 @@ def test_custom_executor_async(model, tmp_path):

engine_args = AsyncEngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
)
@@ -95,7 +103,7 @@ async def t():
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
@@ -104,6 +112,7 @@ def test_respect_ray(model):
engine_args = EngineArgs(
model=model,
distributed_executor_backend="ray",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
enforce_eager=True, # reduce test time
)
engine = LLMEngine.from_engine_args(engine_args)
9 changes: 7 additions & 2 deletions tests/engine/test_skip_tokenizer_init.py
@@ -2,16 +2,21 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
llm = LLM(model=model,
skip_tokenizer_init=True,
load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

with pytest.raises(ValueError, match="cannot pass text prompts when"):
2 changes: 1 addition & 1 deletion tests/engine/test_stop_reason.py
@@ -12,7 +12,7 @@

from vllm import SamplingParams

MODEL = "facebook/opt-350m"
MODEL = "distilbert/distilgpt2"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
13 changes: 10 additions & 3 deletions tests/entrypoints/llm/test_chat.py
@@ -5,12 +5,17 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat

from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


def test_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)

prompt1 = "Explain the concept of entropy."
messages = [
@@ -28,7 +33,7 @@


def test_multi_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)

prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
@@ -65,7 +71,8 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_collective_rpc.py
@@ -28,7 +28,7 @@ class MyWorker(Worker):
def echo_rank(self):
return self.rank

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
enforce_eager=True,
load_format="dummy",
tensor_parallel_size=tp_size,