diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py
index 06f9435816d5c..9677ccd2ea823 100644
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -275,11 +275,11 @@ def _compare_tp(
     if load_format == "dummy":
         # Avoid OOM
         text_overrides = {
-            "num_layers": 1,
-            "num_hidden_layers": 1,
-            "num_experts": 2,
-            "num_experts_per_tok": 2,
-            "num_local_experts": 2,
+            "num_hidden_layers": 4,
+            "hidden_size": 512,
+            "intermediate_size": 800,
+            "num_attention_heads": 4,
+            "num_key_value_heads": 1,
         }
 
         if is_multimodal:
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index 86390c99c2fbc..57ae173af6744 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -6,6 +6,7 @@
 import torch
 import torch.nn as nn
 
+from vllm.config import VllmConfig
 from vllm.model_executor.layers.sampler import SamplerOutput
 from vllm.sequence import ExecuteModelRequest
 from vllm.spec_decode.interfaces import SpeculativeProposals
@@ -25,11 +26,18 @@ class NGramWorker(NonLLMProposerWorkerBase):
     which don't rely on LLM model to give proposals.
     """
 
-    def __init__(self, *args, **kwargs):
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        local_rank: int,
+        device_type: str = "cuda",
+        **kwargs,
+    ):
+        super().__init__(vllm_config)
+
         # Get local_rank/vocab_size from kwargs attribute
-        self.local_rank = kwargs["local_rank"]
-        self.vocab_size = kwargs["vllm_config"].model_config.get_vocab_size()
-        self.device_type = kwargs.get("device_type", "cuda")
+        self.local_rank = local_rank
+        self.device_type = device_type
 
         # Lazy initialization list.
         self._proposer: Top1Proposer
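
Note: a minimal usage sketch (not part of the patch) of how the refactored
NGramWorker constructor is driven once its dependencies are explicit
parameters rather than values fished out of **kwargs. The make_ngram_worker
helper is hypothetical, and vllm_config is assumed to be a fully populated
VllmConfig built elsewhere by the engine:

    from vllm.config import VllmConfig
    from vllm.spec_decode.ngram_worker import NGramWorker

    def make_ngram_worker(vllm_config: VllmConfig,
                          local_rank: int) -> NGramWorker:
        # The explicit signature lets callers (and type checkers) see
        # exactly what the worker needs; super().__init__(vllm_config)
        # now handles the config bookkeeping that was previously done
        # by indexing into kwargs.
        return NGramWorker(
            vllm_config=vllm_config,
            local_rank=local_rank,
            device_type="cuda",  # the default; kept explicit here
        )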