[1/n][CI] Load models in CI from S3 instead of HF #13205

Merged
merged 23 commits on Feb 19, 2025
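This PR points the CI test suite at an S3 bucket (s3://vllm-ci-model-weights) and loads weights with the Run:ai Model Streamer instead of downloading them from the Hugging Face Hub; several tests also switch to smaller models (e.g. facebook/opt-125m to distilbert/distilgpt2). A minimal sketch of the loading pattern the updated tests rely on (not code from this PR; it assumes the environment has read access to the CI bucket):

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat

# distilgpt2 is one of the models mirrored in the CI bucket (see MODELS_ON_S3
# in tests/conftest.py). RUNAI_STREAMER streams the safetensors weights
# straight from S3 rather than fetching them from the Hugging Face Hub.
llm = LLM(model="s3://vllm-ci-model-weights/distilgpt2",
          load_format=LoadFormat.RUNAI_STREAMER)
outputs = llm.generate("How are you?",
                       SamplingParams(temperature=0, max_tokens=10))
print(outputs[0].outputs[0].text)
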
2 changes: 2 additions & 0 deletions requirements-test.in
@@ -37,3 +37,5 @@ genai_perf==0.0.8
tritonclient==2.51.0

numpy < 2.0.0
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
8 changes: 8 additions & 0 deletions requirements-test.txt
@@ -171,6 +171,8 @@ huggingface-hub==0.26.2
# tokenizers
# transformers
# vocos
humanize==4.11.0
# via runai-model-streamer
idna==3.10
# via
# anyio
@@ -290,6 +292,7 @@ numpy==1.26.4
# patsy
# peft
# rouge-score
# runai-model-streamer
# sacrebleu
# scikit-learn
# scipy
@@ -514,6 +517,10 @@ rpds-py==0.20.1
# referencing
rsa==4.7.2
# via awscli
runai-model-streamer==0.11.0
# via -r requirements-test.in
runai-model-streamer-s3==0.11.0
# via -r requirements-test.in
s3transfer==0.10.3
# via
# awscli
@@ -594,6 +601,7 @@ torch==2.5.1
# encodec
# lm-eval
# peft
# runai-model-streamer
# sentence-transformers
# tensorizer
# timm
19 changes: 10 additions & 9 deletions tests/basic_correctness/test_basic_correctness.py
@@ -9,6 +9,7 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat
from vllm.platforms import current_platform

from ..conftest import VllmRunner
@@ -33,7 +34,7 @@ def v1(run_with_both_engines):

def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
llm = LLM("facebook/opt-125m")
llm = LLM("distilbert/distilgpt2", load_format=LoadFormat.RUNAI_STREAMER)
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
@@ -94,14 +95,14 @@ def test_models(
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4"),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "FLASHINFER", "A100"),
("distilbert/distilgpt2", "ray", "", "L4"),
("distilbert/distilgpt2", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("distilbert/distilgpt2", "ray", "", "A100"),
("distilbert/distilgpt2", "mp", "", "A100"),
("distilbert/distilgpt2", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
13 changes: 10 additions & 3 deletions tests/basic_correctness/test_cumem.py
@@ -4,9 +4,11 @@
import torch

from vllm import LLM, SamplingParams
from vllm.config import LoadFormat
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes

from ..conftest import MODEL_WEIGHTS_S3_BUCKET
from ..utils import fork_new_process_for_each_test


@@ -118,13 +120,18 @@ def model(x):
@pytest.mark.parametrize(
"model",
[
"meta-llama/Llama-3.2-1B-Instruct", # sleep mode with safetensors
"facebook/opt-125m" # sleep mode with pytorch checkpoint
# sleep mode with safetensors
f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B",
# sleep mode with pytorch checkpoint
"facebook/opt-125m"
])
def test_end_to_end(model):
free, total = torch.cuda.mem_get_info()
used_bytes_baseline = total - free # in case other process is running
llm = LLM(model, enable_sleep_mode=True)
load_format = LoadFormat.AUTO
if "Llama" in model:
load_format = LoadFormat.RUNAI_STREAMER
llm = LLM(model, load_format=load_format, enable_sleep_mode=True)
prompt = "How are you?"
sampling_params = SamplingParams(temperature=0, max_tokens=10)
output = llm.generate(prompt, sampling_params)
2 changes: 1 addition & 1 deletion tests/basic_correctness/test_preemption.py
@@ -17,7 +17,7 @@
from ..models.utils import check_outputs_equal

MODELS = [
"facebook/opt-125m",
"distilbert/distilgpt2",
]


25 changes: 24 additions & 1 deletion tests/conftest.py
@@ -24,7 +24,7 @@
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import TaskOption, TokenizerPoolConfig
from vllm.config import LoadFormat, TaskOption, TokenizerPoolConfig
from vllm.connections import global_http_connection
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
@@ -46,6 +46,21 @@
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")

_M = TypeVar("_M")

MODELS_ON_S3 = [
"distilbert/distilgpt2",
"meta-llama/Llama-2-7b-hf",
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-3.2-1B",
"meta-llama/Llama-3.2-1B-Instruct",
"openai-community/gpt2",
"ArthurZ/Ilama-3.2-1B",
"llava-hf/llava-1.5-7b-hf",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
]

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

_PromptMultiModalInput = Union[List[_M], List[List[_M]]]

PromptImageInput = _PromptMultiModalInput[Image.Image]
@@ -677,8 +692,15 @@ def __init__(
enable_chunked_prefill: bool = False,
swap_space: int = 4,
enforce_eager: Optional[bool] = False,
load_format: Optional[LoadFormat] = None,
**kwargs,
) -> None:
if model_name in MODELS_ON_S3 and not load_format:
model_name = (f"s3://vllm-ci-model-weights/"
f"{model_name.split('/')[-1]}")
load_format = LoadFormat.RUNAI_STREAMER
if not load_format:
load_format = LoadFormat.AUTO
self.model = LLM(
model=model_name,
task=task,
@@ -693,6 +715,7 @@ def __init__(
max_model_len=max_model_len,
block_size=block_size,
enable_chunked_prefill=enable_chunked_prefill,
load_format=load_format,
**kwargs,
)

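With the conftest change above, existing tests that pass a Hugging Face model ID to VllmRunner are redirected transparently: if the model is listed in MODELS_ON_S3 and no explicit load_format is given, the name is rewritten to the flat CI bucket and the Run:ai streamer is selected. A rough standalone illustration of that rewrite (not additional PR code; load formats shown as plain strings):

MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = ["distilbert/distilgpt2", "meta-llama/Llama-3.2-1B-Instruct"]

def resolve(model_name: str) -> tuple:
    # Mirrors the logic added to VllmRunner.__init__ above: keep only the
    # repository name, since the CI bucket is flat (no org prefixes).
    if model_name in MODELS_ON_S3:
        return (f"{MODEL_WEIGHTS_S3_BUCKET}/{model_name.split('/')[-1]}",
                "runai_streamer")
    return model_name, "auto"

assert resolve("distilbert/distilgpt2") == (
    "s3://vllm-ci-model-weights/distilgpt2", "runai_streamer")

Since only the repo name is kept, two listed models sharing a repo name would collide in the bucket; that appears acceptable for the fixed CI list.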
6 changes: 5 additions & 1 deletion tests/engine/test_computed_prefix_blocks.py
@@ -2,12 +2,15 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
@@ -24,6 +27,7 @@ def test_computed_prefix_blocks(model: str, block_size: int):
"decoration.")

engine_args = EngineArgs(model=model,
load_format=LoadFormat.RUNAI_STREAMER,
block_size=block_size,
enable_prefix_caching=True)

7 changes: 5 additions & 2 deletions tests/engine/test_detokenization.py
@@ -2,11 +2,14 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
@@ -17,7 +20,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")

llm = LLM(model=model)
llm = LLM(model=model, load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
17 changes: 13 additions & 4 deletions tests/engine/test_executor.py
@@ -6,12 +6,17 @@

import pytest

from vllm.config import LoadFormat
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


class Mock:
...
@@ -33,10 +38,11 @@ def collective_rpc(self,
CustomUniExecutorAsync = CustomUniExecutor


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor_type_checking(model):
with pytest.raises(ValueError):
engine_args = EngineArgs(model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=Mock)
LLMEngine.from_engine_args(engine_args)
with pytest.raises(ValueError):
@@ -45,7 +51,7 @@ def test_custom_executor_type_checking(model):
AsyncLLMEngine.from_engine_args(engine_args)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
@@ -54,6 +60,7 @@ def test_custom_executor(model, tmp_path):

engine_args = EngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
)
@@ -68,7 +75,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
@@ -77,6 +84,7 @@ def test_custom_executor_async(model, tmp_path):

engine_args = AsyncEngineArgs(
model=model,
load_format=RUNAI_STREAMER_LOAD_FORMAT,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
)
@@ -95,7 +103,7 @@ async def t():
os.chdir(cwd)


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
@@ -104,6 +112,7 @@ def test_respect_ray(model):
engine_args = EngineArgs(
model=model,
distributed_executor_backend="ray",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
enforce_eager=True, # reduce test time
)
engine = LLMEngine.from_engine_args(engine_args)
9 changes: 7 additions & 2 deletions tests/engine/test_skip_tokenizer_init.py
@@ -2,16 +2,21 @@

import pytest

from vllm.config import LoadFormat
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams

from ..conftest import MODEL_WEIGHTS_S3_BUCKET

@pytest.mark.parametrize("model", ["facebook/opt-125m"])

@pytest.mark.parametrize("model", [f"{MODEL_WEIGHTS_S3_BUCKET}/distilgpt2"])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
# token ids.
llm = LLM(model=model, skip_tokenizer_init=True)
llm = LLM(model=model,
skip_tokenizer_init=True,
load_format=LoadFormat.RUNAI_STREAMER)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)

with pytest.raises(ValueError, match="cannot pass text prompts when"):
2 changes: 1 addition & 1 deletion tests/engine/test_stop_reason.py
@@ -12,7 +12,7 @@

from vllm import SamplingParams

MODEL = "facebook/opt-350m"
MODEL = "distilbert/distilgpt2"
STOP_STR = "."
SEED = 42
MAX_TOKENS = 1024
13 changes: 10 additions & 3 deletions tests/entrypoints/llm/test_chat.py
@@ -5,12 +5,17 @@
import pytest

from vllm import LLM
from vllm.config import LoadFormat

from ...conftest import MODEL_WEIGHTS_S3_BUCKET
from ..openai.test_vision import TEST_IMAGE_URLS

RUNAI_STREAMER_LOAD_FORMAT = LoadFormat.RUNAI_STREAMER


def test_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)

prompt1 = "Explain the concept of entropy."
messages = [
@@ -28,7 +33,7 @@


def test_multi_chat():
llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct")
llm = LLM(model=f"{MODEL_WEIGHTS_S3_BUCKET}/Llama-3.2-1B-Instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT)

prompt1 = "Explain the concept of entropy."
prompt2 = "Explain what among us is."
@@ -65,7 +71,8 @@ def test_multi_chat():
[[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]])
def test_chat_multi_image(image_urls: List[str]):
llm = LLM(
model="microsoft/Phi-3.5-vision-instruct",
model=f"{MODEL_WEIGHTS_S3_BUCKET}/Phi-3.5-vision-instruct",
load_format=RUNAI_STREAMER_LOAD_FORMAT,
dtype="bfloat16",
max_model_len=4096,
max_num_seqs=5,
2 changes: 1 addition & 1 deletion tests/entrypoints/llm/test_collective_rpc.py
@@ -28,7 +28,7 @@ class MyWorker(Worker):
def echo_rank(self):
return self.rank

llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
llm = LLM(model="s3://vllm-ci-model-weights/Llama-3.2-1B-Instruct",
enforce_eager=True,
load_format="dummy",
tensor_parallel_size=tp_size,