From 6510636ecb31a6c9f7045181946ab16cdc833ca5 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 16 Feb 2025 03:38:26 +0000 Subject: [PATCH 1/6] Fix 2 Node tests Signed-off-by: DarkLight1337 --- tests/distributed/test_pipeline_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 06f9435816d5c..92a366892ea16 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -307,7 +307,8 @@ def _compare_tp( "8", ] if chunked_prefill: - common_args.append("--enable-chunked-prefill") + common_args.extend( + ["--enable-chunked-prefill", "--disable-sliding-window"]) if eager_mode: common_args.append("--enforce-eager") if task != "auto": From f5e98e7b5b84898d91afc5b86be72dda7fef0205 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 16 Feb 2025 04:52:13 +0000 Subject: [PATCH 2/6] Try another fix Signed-off-by: DarkLight1337 --- tests/distributed/test_pipeline_parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 92a366892ea16..e768ca23009ef 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -277,6 +277,7 @@ def _compare_tp( text_overrides = { "num_layers": 1, "num_hidden_layers": 1, + "num_attention_heads": tp_size * pp_size, "num_experts": 2, "num_experts_per_tok": 2, "num_local_experts": 2, @@ -307,8 +308,7 @@ def _compare_tp( "8", ] if chunked_prefill: - common_args.extend( - ["--enable-chunked-prefill", "--disable-sliding-window"]) + common_args.append("--enable-chunked-prefill") if eager_mode: common_args.append("--enforce-eager") if task != "auto": From e880fc6186299e8e227988ed4310e4bddd6b2575 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 16 Feb 2025 05:03:37 +0000 Subject: [PATCH 3/6] Fix `NGramWorker` initialization Signed-off-by: DarkLight1337 --- vllm/spec_decode/ngram_worker.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py index 86390c99c2fbc..57ae173af6744 100644 --- a/vllm/spec_decode/ngram_worker.py +++ b/vllm/spec_decode/ngram_worker.py @@ -6,6 +6,7 @@ import torch import torch.nn as nn +from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest from vllm.spec_decode.interfaces import SpeculativeProposals @@ -25,11 +26,18 @@ class NGramWorker(NonLLMProposerWorkerBase): which don't rely on LLM model to give proposals. """ - def __init__(self, *args, **kwargs): + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + device_type: str = "cuda", + **kwargs, + ): + super().__init__(vllm_config) + # Get local_rank/vocab_size from kwargs attribute - self.local_rank = kwargs["local_rank"] - self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size() - self.device_type = kwargs.get("device_type", "cuda") + self.local_rank = local_rank + self.device_type = device_type # Lazy initialization list. self._proposer: Top1Proposer From a855844f3ba7fe72b743b07280de5a52dc946aa9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 16 Feb 2025 07:36:00 +0000 Subject: [PATCH 4/6] Reduce memory footprint to avoid OOM Signed-off-by: DarkLight1337 --- tests/distributed/test_pipeline_parallel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index e768ca23009ef..dcff12dc9c876 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -275,6 +275,8 @@ def _compare_tp( if load_format == "dummy": # Avoid OOM text_overrides = { + "hidden_size": 512, + "intermediate_size": 800, "num_layers": 1, "num_hidden_layers": 1, "num_attention_heads": tp_size * pp_size, From fb2fcf7a4c30c027d99d9bb5863cc6a0085d7907 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 16 Feb 2025 08:30:44 +0000 Subject: [PATCH 5/6] Test Signed-off-by: DarkLight1337 --- tests/distributed/test_pipeline_parallel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index dcff12dc9c876..b3149e8604772 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -279,7 +279,8 @@ def _compare_tp( "intermediate_size": 800, "num_layers": 1, "num_hidden_layers": 1, - "num_attention_heads": tp_size * pp_size, + "num_attention_heads": 4, + "num_key_value_heads": 1, "num_experts": 2, "num_experts_per_tok": 2, "num_local_experts": 2, From 2e3cbae5c40b56b64a42ca8dacc8842409a94e5a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 16 Feb 2025 09:35:41 +0000 Subject: [PATCH 6/6] Test Signed-off-by: DarkLight1337 --- tests/distributed/test_pipeline_parallel.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index b3149e8604772..9677ccd2ea823 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -275,15 +275,11 @@ def _compare_tp( if load_format == "dummy": # Avoid OOM text_overrides = { + "num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, - "num_layers": 1, - "num_hidden_layers": 1, "num_attention_heads": 4, "num_key_value_heads": 1, - "num_experts": 2, - "num_experts_per_tok": 2, - "num_local_experts": 2, } if is_multimodal: