[1/n][CI] Load models in CI from S3 instead of HF (vllm-project#13205)
Signed-off-by: <>
Co-authored-by: EC2 Default User <[email protected]>
2 people authored and kerthcet committed Feb 21, 2025
1 parent f5cb1db commit b84fe3a
Showing 43 changed files with 225 additions and 76 deletions.
2 changes: 2 additions & 0 deletions requirements-test.in
@@ -37,3 +37,5 @@ genai_perf==0.0.8
tritonclient==2.51.0

numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
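The two new packages above are what let the CI tests stream safetensors weights directly from S3 instead of downloading them from Hugging Face. A minimal sketch of the intended usage, assuming the CI bucket path introduced later in this commit and S3 credentials already configured in the environment:

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat

# Stream weights from the CI S3 mirror rather than the Hugging Face Hub.
# Requires the runai-model-streamer and runai-model-streamer-s3 packages above.
llm = LLM(
    model="s3://vllm-ci-model-weights/distilgpt2",
    load_format=LoadFormat.RUNAI_STREAMER,
)
print(llm.generate("Hello", SamplingParams(temperature=0, max_tokens=8)))
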
8 changes: 8 additions & 0 deletions requirements-test.txt
@@ -171,6 +171,8 @@ huggingface-hub==0.26.2
# tokenizers
# transformers
# vocos
humanize==4.11.0
# via runai-model-streamer
idna==3.10
# via
# anyio
@@ -290,6 +292,7 @@ numpy==1.26.4
# patsy
# peft
# rouge-score
# runai-model-streamer
# sacrebleu
# scikit-learn
# scipy
@@ -514,6 +517,10 @@ rpds-py==0.20.1
# referencing
rsa==4.7.2
# via awscli
runai-model-streamer==0.11.0
# via -r requirements-test.in
runai-model-streamer-s3==0.11.0
# via -r requirements-test.in
s3transfer==0.10.3
# via
# awscli
@@ -594,6 +601,7 @@ torch==2.5.1
# encodec
# lm-eval
# peft
# runai-model-streamer
# sentence-transformers
# tensorizer
# timm
19 changes: 10 additions & 9 deletions tests/basic_correctness/test_basic_correctness.py
@@ -9,6 +9,7 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform

from ..conftest import VllmRunner
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):

def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
@@ -94,14 +95,14 @@ def test_models(
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
13 changes: 10 additions & 3 deletions tests/basic_correctness/test_cumem.py
@@ -4,9 +4,11 @@
import torch

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes

from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test


@@ -118,13 +120,18 @@ def model(x):
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
# sleep mode with safetensors
f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
"facebook/opt-125m"
])
def test_end_to_end(model):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
load_format = LoadFormat.AUTO
if "Llama" in model:
load_format = LoadFormat.RUNAI_STREAMER
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
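The hunk above selects the load format per model: the Llama checkpoint mirrored to S3 goes through the Run:ai streamer, while facebook/opt-125m (a PyTorch pickle checkpoint kept on Hugging Face) stays on the default loader. The same decision can be written as a small helper; the helper name and the startswith check are illustrative only, not part of vLLM:

from vllm import LLM
from vllm.config import LoadFormat

def make_llm(model: str, **kwargs) -> LLM:
    # Hypothetical helper: stream any s3:// path, fall back to AUTO otherwise.
    load_format = (LoadFormat.RUNAI_STREAMER
                   if model.startswith("s3://") else LoadFormat.AUTO)
    return LLM(model, load_format=load_format, **kwargs)

# Equivalent to f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B" in the test above.
llm = make_llm("s3://vllm-ci-model-weights/Llama-3.2-1B", enable_sleep_mode=True)
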
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_preemption.py
@@ -17,7 +17,7 @@
from ..models.utils import check_outputs_equal

MODELS = [
"facebook/opt-125m",
"distilbert/distilgpt2",
]


25 changes: 24 additions & 1 deletion tests/conftest.py
@@ -24,7 +24,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
@@ -46,6 +46,21 @@
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

_M = TypeVar("_M")

MODELS_ON_S3 = [
"distilbert/distilgpt2",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -677,8 +692,15 @@ def __init__(
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs,
) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"s3://vllm-ci-model-weights/"
f"{model_name.split('/')[-1]}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM(
model=model_name,
task=task,
@@ -693,6 +715,7 @@ def __init__(
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs,
)

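With the VllmRunner change above, test code keeps passing the familiar Hugging Face model names; for models listed in MODELS_ON_S3 the fixture transparently rewrites the name to the CI bucket and switches to the streaming loader. A rough usage sketch, assuming VllmRunner's context-manager and generate_greedy interface as used elsewhere in this test suite:

from tests.conftest import VllmRunner  # import path assumed from the repo layout

# "distilbert/distilgpt2" is in MODELS_ON_S3, so this resolves internally to
# model="s3://vllm-ci-model-weights/distilgpt2" with
# load_format=LoadFormat.RUNAI_STREAMER; passing load_format explicitly
# skips the rewrite.
with VllmRunner("distilbert/distilgpt2", max_model_len=256) as vllm_model:
    outputs = vllm_model.generate_greedy(["Hello, my name is"], max_tokens=8)
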
6 changes: 5 additions & 1 deletion tests/engine/test_computed_prefix_blocks.py
@@ -2,12 +2,15 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
@@ -24,6 +27,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.")

engine_args = EngineArgs(model=model,
load_format=LoadFormat.RUNAI_STREAMER,
block_size=block_size,
enable_prefix_caching=True)

7 changes: 5 additions & 2 deletions tests/engine/test_detokenization.py
@@ -2,11 +2,14 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
@@ -17,7 +20,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")

llm = LLM(model=model)
llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
17 changes: 13 additions & 4 deletions tests/engine/test_executor.py
@@ -6,12 +6,17 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


class Mock:
...
@@ -33,10 +38,11 @@ def collective_rpc(self,
CustomUniExecutorAsync = CustomUniExecutor


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor_type_checking(model):
with pytest.raises(ValueError):
engine_args = EngineArgs(model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError):
@@ -45,7 +51,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
@@ -54,6 +60,7 @@ def test_custom_executor(model, tmp_path):

engine_args = EngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
)
@@ -68,7 +75,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
@@ -77,6 +84,7 @@ def test_custom_executor_async(model, tmp_path):

engine_args = AsyncEngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
)
@@ -95,7 +103,7 @@ async def t():
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
@@ -104,6 +112,7 @@ def test_respect_ray(model):
engine_args = EngineArgs(
model=model,
distributed_executor_backend="ray",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
enforce_eager=True, # reduce test time
)
engine = LLMEngine.from_engine_args(engine_args)
9 changes: 7 additions & 2 deletions tests/engine/test_skip_tokenizer_init.py
@@ -2,16 +2,21 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
llm = LLM(model=model,
skip_tokenizer_init=True,
load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

with pytest.raises(ValueError, match="cannot pass text prompts when"):
2 changes: 1 addition & 1 deletion tests/engine/test_stop_reason.py
@@ -12,7 +12,7 @@

from vllm import SamplingParams

MODEL = "facebook/opt-350m"
MODEL = "distilbert/distilgpt2"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
13 changes: 10 additions & 3 deletions tests/entrypoints/llm/test_chat.py
@@ -5,12 +5,17 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat

from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


def test_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)

prompt1 = "Explain the concept of entropy."
messages = [
@@ -28,7 +33,8 @@ def test_chat():


def test_multi_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)

prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
@@ -65,7 +71,8 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_collective_rpc.py
@@ -28,7 +28,7 @@ class MyWorker(Worker):
def echo_rank(self):
return self.rank

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
enforce_eager=True,
load_format="dummy",
tensor_parallel_size=tp_size,