
Commit db02ec7

Merge branch 'main' into fix/ppu-1107
2 parents: f98b3f5 + 10d9096

18 files changed: +247 -17 lines changed

docs/source/Megatron-SWIFT/Command-line-parameters.md

Lines changed: 3 additions & 1 deletion

@@ -185,10 +185,12 @@
 - moe_ffn_hidden_size: Hidden layer size of each expert's feed-forward network (ffn). Defaults to None and is read automatically from config.json; if not found there and `num_experts` is not None, it is set to ffn_hidden_size.
 - moe_shared_expert_intermediate_size: Total FFN hidden layer size of the shared experts. If there are multiple shared experts, it should equal `num_shared_experts * ffn_size_of_each_shared_expert`. Defaults to None; read automatically from config.json.
 - moe_router_topk: Number of experts each token is routed to. Defaults to None; read automatically from config.json.
+- moe_router_num_groups: Number of groups the experts are divided into, for group-limited routing; see DeepSeek-V2 and DeepSeek-V3. Defaults to None; read automatically from config.json.
+- moe_router_group_topk: Number of groups selected in group-limited routing. Defaults to None; read automatically from config.json.
 - moe_router_pre_softmax: Enable pre-softmax routing for MoE, meaning softmax is applied before the top-k selection. Defaults to None; read automatically from config.json.
 - 🔥moe_router_dtype: Data type used for routing computation and the weighted averaging of expert outputs. Options are 'none', 'fp32', and 'fp64'; this improves numerical stability, especially when the number of experts is large. Used together with `moe_permute_fusion`, the performance impact is negligible. Defaults to 'fp32'; 'none' means the data type is left unchanged.
 - moe_router_score_function: Scoring function for MoE top-k routing. Either "softmax" or "sigmoid". Defaults to None; read from config.json.
-- moe_router_bias_update_rate: Update rate of the expert bias in the auxiliary-loss-free load-balancing strategy. The expert bias is updated according to the number of tokens each expert is assigned in the global batch: the bias increases for experts assigned fewer tokens and decreases for experts assigned more. Defaults to 1e-3, the same value as used in DeepSeek-V3.
+- moe_router_bias_update_rate: Update rate of the expert bias in the auxiliary-loss-free load-balancing strategy. The expert bias is updated according to the number of tokens each expert is assigned in the global batch: the bias increases for experts assigned fewer tokens and decreases for experts assigned more. Defaults to None; read from config.json.
 - moe_router_enable_expert_bias: Top-k routing with a dynamic expert bias under the auxiliary-loss-free load-balancing strategy. Routing decisions are based on the sum of the routing scores and the expert bias. See https://arxiv.org/abs/2408.15664 for details. Defaults to None; read automatically from config.json.
 - moe_router_topk_scaling_factor: Defaults to None; read from config.json.
 - moe_router_load_balancing_type: Determines the router's load-balancing strategy. Options are "aux_loss", "seq_aux_loss", "sinkhorn", and "none". Defaults to None; read from config.json.

docs/source_en/Megatron-SWIFT/Command-line-parameters.md

Lines changed: 3 additions & 1 deletion

@@ -197,10 +197,12 @@ For guidance on selecting parallelization strategies, please refer to the [Train
 - moe_ffn_hidden_size: Hidden layer size of the feedforward network (ffn) for each expert. Default is None and will be automatically read from config.json. If not found and `num_experts` is not None, it will be set to ffn_hidden_size.
 - moe_shared_expert_intermediate_size: The total FFN hidden layer size for shared experts. If there are multiple shared experts, it should equal `num_shared_experts * ffn_size_of_each_shared_expert`. Default is None. Automatically read from config.json.
 - moe_router_topk: The number of experts each token is routed to. Default is None. Automatically read from config.json.
+- moe_router_num_groups: Number of groups to divide experts into for group-limited routing. Refers to DeepSeek-V2 and DeepSeek-V3. Default is None. Automatically read from config.json.
+- moe_router_group_topk: Number of selected groups for group-limited routing. Default is None. Automatically read from config.json.
 - moe_router_pre_softmax: Enable pre-softmax routing for MoE, meaning that softmax will be applied before top-k selection. Default is None. Automatically read from config.json.
 - 🔥moe_router_dtype: Data type used for routing computation and expert output weighted averaging. Options are 'none', 'fp32', and 'fp64', which enhance numerical stability, especially when the number of experts is large. When used together with `moe_permute_fusion`, the performance impact is negligible. Default is 'fp32'. 'none' means no change to data type.
 - moe_router_score_function: Scoring function for MoE TopK routing. Can be "softmax" or "sigmoid". Default is None and is read from config.json.
-- moe_router_bias_update_rate: Update rate of expert bias in the auxiliary-loss-free load balancing strategy. Expert bias is updated based on the number of tokens each expert is assigned in the global batch: bias increases for experts assigned fewer tokens, and decreases for those assigned more tokens. Default is 1e-3, same as used in DeepSeekV3.
+- moe_router_bias_update_rate: Update rate of expert bias in the auxiliary-loss-free load balancing strategy. Expert bias is updated based on the number of tokens each expert is assigned in the global batch: bias increases for experts assigned fewer tokens, and decreases for those assigned more tokens. Default is None and is read from config.json.
 - moe_router_enable_expert_bias: TopK routing with dynamic expert bias in the auxiliary-loss-free load balancing strategy. Routing decisions are based on the sum of routing scores and expert bias. See details at: https://arxiv.org/abs/2408.15664. Default is None and is automatically read from config.json.
 - moe_router_topk_scaling_factor: Default is None. This parameter is read from config.json.
 - moe_router_load_balancing_type: Determines the router's load balancing strategy. Options are "aux_loss", "seq_aux_loss", "sinkhorn", and "none". Default is None and is read from config.json.
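
The two new parameters implement DeepSeek-style group-limited routing: experts are partitioned into `moe_router_num_groups` groups, only the best `moe_router_group_topk` groups are kept per token, and the usual top-k selection then runs inside those groups. Below is a minimal PyTorch sketch of the idea, not Megatron-SWIFT's implementation; the group score is simplified to a per-group max, whereas DeepSeek-V3 sums each group's top-2 expert scores.

import torch

def group_limited_topk(scores: torch.Tensor, num_groups: int, group_topk: int, topk: int):
    """scores: [num_tokens, num_experts] router scores; experts grouped contiguously."""
    num_tokens, num_experts = scores.shape
    # Score each group by its best expert (simplified group metric).
    group_scores = scores.view(num_tokens, num_groups, -1).max(dim=-1).values
    # Keep only the top `group_topk` groups per token.
    group_idx = group_scores.topk(group_topk, dim=-1).indices
    group_mask = torch.zeros_like(group_scores).scatter_(1, group_idx, 1.0)
    # Mask out experts belonging to non-selected groups, then do the ordinary top-k.
    expert_mask = group_mask.repeat_interleave(num_experts // num_groups, dim=-1)
    masked = scores.masked_fill(expert_mask == 0, float('-inf'))
    return masked.topk(topk, dim=-1)

# Example: 64 experts in 8 groups; each token routes into 4 groups, top-8 experts overall.
values, expert_idx = group_limited_topk(torch.randn(4, 64), num_groups=8, group_topk=4, topk=8)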
examples/train/multi-gpu/fsdp2_lora/fsdp2.json

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+{
+    "compute_environment": "LOCAL_MACHINE",
+    "debug": false,
+    "distributed_type": "FSDP",
+    "downcast_bf16": "no",
+    "fsdp_config": {
+        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+        "fsdp_cpu_ram_efficient_loading": true,
+        "fsdp_reshard_after_forward": true,
+        "fsdp_state_dict_type": "FULL_STATE_DICT",
+        "fsdp_activation_checkpointing": true,
+        "fsdp_version": 2
+    },
+    "machine_rank": 0,
+    "main_training_function": "main",
+    "mixed_precision": "bf16",
+    "num_machines": 1,
+    "num_processes": 2,
+    "rdzv_backend": "static",
+    "same_network": true,
+    "tpu_env": [],
+    "tpu_use_cluster": false,
+    "tpu_use_sudo": false,
+    "use_cpu": false
+}
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+# 14.7GiB * 2
+nproc_per_node=2
+
+CUDA_VISIBLE_DEVICES=0,1 \
+accelerate launch --config_file "./examples/train/multi-gpu/fsdp2_lora/fsdp2.json" \
+    swift/cli/sft.py \
+    --model Qwen/Qwen2.5-7B-Instruct \
+    --train_type lora \
+    --dataset 'swift/self-cognition#1000' \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --gradient_checkpointing false \
+    --weight_decay 0.1 \
+    --target_modules all-linear \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --system 'You are a helpful assistant.' \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --model_author swift \
+    --model_name swift-robot
Lines changed: 34 additions & 0 deletions

@@ -0,0 +1,34 @@
+import os
+from typing import List
+
+from swift.llm import BaseArguments, InferRequest, PtEngine, get_template
+
+os.environ['IMAGE_MAX_TOKEN_NUM'] = '1024'
+os.environ['VIDEO_MAX_TOKEN_NUM'] = '128'
+os.environ['FPS_MAX_FRAMES'] = '16'
+
+infer_request = InferRequest(
+    messages=[{
+        'role':
+        'user',
+        'content':
+        "多标签分类,类别包括:['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', "
+        "'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', "
+        "'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']"
+    }],
+    images=['xxx.jpg'])
+adapter_path = 'output/vx-xxx/checkpoint-xxx'
+args = BaseArguments.from_pretrained(adapter_path)
+
+engine = PtEngine(
+    args.model,
+    adapters=[adapter_path],
+    task_type='seq_cls',
+    num_labels=args.num_labels,
+    problem_type=args.problem_type)
+template = get_template(args.template, engine.processor, args.system, use_chat_template=args.use_chat_template)
+engine.default_template = template
+
+resp_list = engine.infer([infer_request])
+response: List[int] = resp_list[0].choices[0].message.content
+print(f'response: {response}')
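
The Chinese prompt string above reads "Multi-label classification; the categories include: [...]" and matches the query built by the dataset preprocessor (see the mllm.py diff below). Since `response` is a list of class indices, a small follow-up sketch can map it back to names; `CLASS_NAMES` here is hypothetical, copied from the VOC list in the prompt:

# Map predicted label indices back to the VOC class names used in the prompt.
CLASS_NAMES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
               'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
               'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
predicted = [CLASS_NAMES[i] for i in response]  # e.g. [6, 14] -> ['car', 'person']
print(predicted)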
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+CUDA_VISIBLE_DEVICES=0 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --load_data_args true
Lines changed: 31 additions & 0 deletions

@@ -0,0 +1,31 @@
+CUDA_VISIBLE_DEVICES=0 \
+IMAGE_MAX_TOKEN_NUM=1024 \
+VIDEO_MAX_TOKEN_NUM=128 \
+FPS_MAX_FRAMES=16 \
+swift sft \
+    --model Qwen/Qwen3-VL-4B-Instruct \
+    --train_type lora \
+    --dataset 'clip-benchmark/wds_voc2007_multilabel' \
+    --load_from_cache_file true \
+    --split_dataset_ratio 0.01 \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 2 \
+    --per_device_train_batch_size 16 \
+    --per_device_eval_batch_size 16 \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --gradient_accumulation_steps 1 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --dataset_num_proc 4 \
+    --num_labels 20 \
+    --task_type seq_cls \
+    --problem_type multi_label_classification
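
With `--task_type seq_cls --problem_type multi_label_classification --num_labels 20`, the model gets a 20-way classification head whose per-class logits are trained independently with binary cross-entropy, the convention transformers uses for this problem_type. A minimal sketch of that loss in plain PyTorch (not ms-swift internals):

import torch
import torch.nn.functional as F

num_labels = 20                      # matches --num_labels above
logits = torch.randn(4, num_labels)  # [batch, num_labels] from the seq_cls head
# Multi-hot targets: 1.0 for every class present in the image.
targets = torch.zeros(4, num_labels)
targets[0, [6, 14]] = 1.0            # e.g. sample 0 contains 'car' and 'person'
loss = F.binary_cross_entropy_with_logits(logits, targets)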

swift/llm/dataset/dataset/mllm.py

Lines changed: 19 additions & 0 deletions

@@ -1306,3 +1306,22 @@ def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
         hf_dataset_id='leonardPKU/clevr_cogen_a_train',
         preprocess_func=ClevrPreprocessor(),
         tags=['qa', 'math', 'vision', 'grpo']))
+
+
+class Voc2007MultilabelPreprocessor(ResponsePreprocessor):
+    CLASS_NAME = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable',
+                  'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor')
+
+    def preprocess(self, row: Dict[str, Any]) -> Dict[str, Any]:
+        row['query'] = f'多标签分类,类别包括:{list(self.CLASS_NAME)}'
+        row['label'] = [i for i, x in enumerate(row['npy']) if x == 1]
+        return super().preprocess(row)
+
+
+register_dataset(
+    DatasetMeta(
+        ms_dataset_id='clip-benchmark/wds_voc2007_multilabel',
+        hf_dataset_id='clip-benchmark/wds_voc2007_multilabel',
+        preprocess_func=Voc2007MultilabelPreprocessor(columns={'webp': 'images'}),
+        tags=['multilabel', 'multi-modal'],
+    ))
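
To see what the preprocessor does, here is a toy walk-through with made-up row data: `npy` holds the dataset's multi-hot label vector, and the indices of its 1-entries become the integer labels.

# Hypothetical row: only 'car' (index 6) and 'person' (index 14) are present.
row = {'npy': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]}
label = [i for i, x in enumerate(row['npy']) if x == 1]
assert label == [6, 14]  # indices follow the CLASS_NAME order above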

swift/llm/infer/deploy.py

Lines changed: 15 additions & 1 deletion

@@ -13,7 +13,7 @@
 import uvicorn
 from aiohttp import ClientConnectorError
 from fastapi import FastAPI, Request
-from fastapi.responses import JSONResponse, StreamingResponse
+from fastapi.responses import JSONResponse, Response, StreamingResponse

 from swift.llm import AdapterRequest, DeployArguments, InferArguments
 from swift.llm.infer.protocol import EmbeddingRequest, MultiModalRequestMixin
@@ -42,6 +42,9 @@ def get_infer_engine(args: InferArguments, template=None, **kwargs):
         return SwiftInfer.get_infer_engine(args, template, **kwargs)

     def _register_app(self):
+        self.app.get('/health')(self.health)
+        self.app.get('/ping')(self.ping)
+        self.app.post('/ping')(self.ping)
         self.app.get('/v1/models')(self.get_available_models)
         self.app.post('/v1/chat/completions')(self.create_chat_completion)
         self.app.post('/v1/completions')(self.create_completion)
@@ -85,6 +88,17 @@ def _get_model_list(self):
             model_list += [name for name in args.adapter_mapping.keys()]
         return model_list

+    async def health(self) -> Response:
+        """Health check endpoint."""
+        if self.infer_engine is not None:
+            return Response(status_code=200)
+        else:
+            return Response(status_code=503)
+
+    async def ping(self) -> Response:
+        """Ping check endpoint. Required for SageMaker compatibility."""
+        return await self.health()
+
     async def get_available_models(self):
         model_list = self._get_model_list()
         data = [Model(id=model_id, owned_by=self.args.owned_by) for model_id in model_list]
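
A quick way to exercise the new endpoints, assuming a server started with `swift deploy` and reachable at 127.0.0.1:8000 (the host and port are assumptions; adjust to your deployment):

import requests  # third-party HTTP client: pip install requests

# 200 once the infer engine is initialized, 503 otherwise (see health() above).
print(requests.get('http://127.0.0.1:8000/health').status_code)
# /ping mirrors /health and also accepts POST, as SageMaker requires.
print(requests.post('http://127.0.0.1:8000/ping').status_code)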

swift/llm/train/tuner.py

Lines changed: 3 additions & 0 deletions

@@ -93,6 +93,7 @@ def get_multimodal_target_regex(
        freeze_vit: bool = True,
        freeze_aligner: bool = True,
        include_embedding: bool = False,
+       exclude_router: bool = False,
 ) -> str:
     model_arch = model.model_meta.model_arch
     modules = []
@@ -117,6 +118,8 @@ def get_multimodal_target_regex(

         sub_module = deep_getattr(model, module)
         target_modules = find_all_linears(sub_module, model_arch, extra_layers)
+        if exclude_router and model.model_info.is_moe_model:
+            target_modules = [tm for tm in target_modules if tm not in {'gate'}]
         if not target_modules:
             continue
         target_modules = [tm for tm in target_modules if tm]
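
The effect of the new `exclude_router` flag, in isolation: for MoE models it drops the router's `gate` linear from the LoRA target list so routing weights stay frozen. A toy illustration with made-up module names; note that only the exact string 'gate' is filtered, so MLP projections like 'gate_proj' are kept.

# Hypothetical target list as returned by find_all_linears for one submodule.
target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate', 'gate_proj', 'up_proj', 'down_proj']
exclude_router, is_moe_model = True, True
if exclude_router and is_moe_model:
    target_modules = [tm for tm in target_modules if tm not in {'gate'}]
print(target_modules)  # 'gate' removed; 'gate_proj' kept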
