NVIDIA · syuoni · Jun 15, 2025 · Jun 13, 2025
@@ -233,17 +233,18 @@ def __init__(
 class Llama4MoE(nn.Module):
 
     def __init__(
-            self,
-            *,
-            num_experts: int,
-            top_k: int,
-            hidden_size: int,
-            intermediate_size: int,
-            shared_expert_intermediate_size: int,
-            aux_stream: torch.cuda.Stream,
-            dtype: Optional[torch.dtype] = None,
-            tune_max_num_tokens: int = 8192,
-            model_config: ModelConfig = ModelConfig(),
+        self,
+        *,
+        num_experts: int,
+        top_k: int,
+        hidden_size: int,
+        intermediate_size: int,
+        shared_expert_intermediate_size: int,
+        aux_stream: torch.cuda.Stream,
+        dtype: Optional[torch.dtype] = None,
+        tune_max_num_tokens: int = 8192,
+        model_config: ModelConfig = ModelConfig(),
+        layer_idx: Optional[int] = None,
     ):
         from tensorrt_llm._torch.distributed import AllReduce
 
@@ -273,7 +274,8 @@ def __init__(
             False,  # In both low latency and max-throughput scenarios, FusedMoE needs not to do allreduce inside op.
             weight_loading_mode=MoEWeightLoadingMode.FUSED_GATE_UP_PROJ,
             model_config=model_config,
-            apply_router_weight_on_input=True)
+            apply_router_weight_on_input=True,
+            layer_idx=layer_idx)
 
         self.router = Linear(hidden_size,
                              num_experts,
@@ -403,7 +405,8 @@ def __init__(
                 shared_expert_intermediate_size=config.intermediate_size,
                 model_config=model_config,
                 aux_stream=aux_stream,
-                dtype=config.torch_dtype)
+                dtype=config.torch_dtype,
+                layer_idx=layer_idx)
 
             # self.fusion_config.POST_MOE_FUSION = model_config.mapping.has_tp(
             # )

@@ -25,6 +25,7 @@ def __init__(
         self,
         model_config: ModelConfig[PretrainedConfig],
         aux_stream: torch.cuda.Stream,
+        layer_idx: Optional[int] = None,
     ):
         super().__init__()
         config = model_config.pretrained_config
@@ -51,7 +52,8 @@ def __init__(
             aux_stream=aux_stream,
             dtype=config.torch_dtype,
             reduce_results=reduce_results,
-            model_config=model_config)
+            model_config=model_config,
+            layer_idx=layer_idx)
 
     def forward(
         self,
@@ -108,7 +110,9 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
 
         self.self_attn = MixtralAttention(model_config, layer_idx=layer_idx)
 
-        self.block_sparse_moe = MixtralMoE(model_config, aux_stream)
+        self.block_sparse_moe = MixtralMoE(model_config,
+                                           aux_stream,
+                                           layer_idx=layer_idx)
 
         self.input_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                        eps=config.rms_norm_eps,

@@ -78,7 +78,7 @@ def __init__(
         self,
         model_config: ModelConfig[Qwen3MoeConfig],
         aux_stream: torch.cuda.Stream,
-        layer_idx: int,
+        layer_idx: Optional[int] = None,
     ):
         super().__init__()
         config = model_config.pretrained_config
@@ -115,6 +115,7 @@ def __init__(
             dtype=config.torch_dtype,
             reduce_results=False,
             model_config=model_config,
+            layer_idx=layer_idx,
         )
 
     @staticmethod

@@ -29,6 +29,7 @@ def __init__(
         self,
         model_config: ModelConfig[Qwen2MoeConfig],
         aux_stream: torch.cuda.Stream,
+        layer_idx: Optional[int] = None,
     ):
         super().__init__()
         config = model_config.pretrained_config
@@ -57,7 +58,8 @@ def __init__(
             aux_stream=aux_stream,
             dtype=config.torch_dtype,
             reduce_results=reduce_results,
-            model_config=model_config)
+            model_config=model_config,
+            layer_idx=layer_idx)
 
         self.shared_expert = GatedMLP(
             hidden_size=config.hidden_size,
@@ -143,7 +145,7 @@ def __init__(self, model_config: ModelConfig[Qwen2MoeConfig],
             layer_idx=layer_idx,
         )
 
-        self.mlp = QwenMoE(model_config, aux_stream)
+        self.mlp = QwenMoE(model_config, aux_stream, layer_idx=layer_idx)
 
         self.input_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                        eps=config.rms_norm_eps,

@@ -754,6 +754,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 moe_model_arch_list = [
     'DeepseekV3ForCausalLM',
+    'MixtralForCausalLM',
+    'Llama4ForConditionalGeneration',
+    'Qwen2MoeForCausalLM',
+    'Qwen3MoeForCausalLM',
 ]