Commit 4140cfb

[megatron] support vit_lr aligner_lr (modelscope#6469)
1 parent: 9238661

File tree

4 files changed (+163 -2 lines)

docs/source/Megatron-SWIFT/Command-line-parameters.md

Lines changed: 3 additions & 0 deletions
@@ -283,6 +283,9 @@ Megatron training parameters inherit from Megatron parameters and basic parameters (**shared with ms-swift
 - Note: **Megatron-SWIFT training features prioritize support for the padding_free format**; do not modify this value unless there is a special reason.
 - mlp_padding_free: Defaults to False. Used to apply padding-free optimization to the MLP when padding_free is set to false. This improves training speed and reduces memory usage while allowing a custom attention_mask.
 - vit_gradient_checkpointing: Whether to enable gradient_checkpointing for the ViT part when training multimodal models. Defaults to True. (**The ViT implementation in Megatron-SWIFT uses the transformers implementation.**)
+- vit_lr: When training a multimodal large model, this parameter specifies the learning rate of the ViT. Defaults to None, i.e. equal to learning_rate.
+  - Typically used together with the `--freeze_vit false` and `--freeze_aligner false` parameters.
+- aligner_lr: When training a multimodal large model, this parameter specifies the learning rate of the aligner. Defaults to None, i.e. equal to learning_rate.
 - gradient_checkpointing_kwargs: Arguments passed to `torch.utils.checkpoint`, e.g. `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`. Defaults to None. This parameter only takes effect for `vit_gradient_checkpointing`.
 - 🔥packing: Whether to use sequence packing to improve computational efficiency (better load balancing across nodes and processes and higher GPU utilization, at the cost of extra preprocessing time) and stabilize memory usage. Defaults to False. Currently supports CPT/SFT/DPO/KTO/RM.
 - Note: **Different sequences within the same batch remain mutually invisible**, except for Qwen3-Next.

docs/source_en/Megatron-SWIFT/Command-line-parameters.md

Lines changed: 3 additions & 0 deletions
@@ -300,6 +300,9 @@ Megatron training parameters are inherited from Megatron parameters and basic pa
 - Note: **The Megatron-SWIFT training feature prioritizes support for the padding-free format**. Unless under special circumstances, please do not modify this value.
 - mlp_padding_free: The default is False. This is used for applying padding-free optimization to the MLP when padding_free is set to false. It allows for improved training speed and reduced memory usage while customizing the attention_mask.
 - vit_gradient_checkpointing: Whether to enable gradient checkpointing for the ViT (Vision Transformer) component during multimodal model training. Defaults to `True`. (**The ViT implementation in Megatron-SWIFT uses the Hugging Face `transformers` library.**)
+- vit_lr: Specifies the learning rate for the ViT module when training multimodal models. Default is `None`, same as `learning_rate`.
+  - Typically used together with `--freeze_vit false` and `--freeze_aligner false`.
+- aligner_lr: Specifies the learning rate for the aligner module in multimodal models. Default is `None`, same as `learning_rate`.
 - gradient_checkpointing_kwargs: Arguments passed to `torch.utils.checkpoint`. For example: set `--gradient_checkpointing_kwargs '{"use_reentrant": false}'`. Defaults to `None`. This parameter only takes effect when `vit_gradient_checkpointing` is enabled.
 - 🔥packing: Whether to use sequence packing to improve computational efficiency (achieving better load balancing across nodes and processes, and higher GPU utilization), at the cost of additional preprocessing time, while also stabilizing GPU memory usage. Defaults to `False`. Currently supported for CPT, SFT, DPO, KTO and RM.
 - Note: **Sequences within the same batch remain mutually invisible**, except for Qwen3-Next.
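
The semantics documented above: each component's effective learning rate is its own override when set, otherwise the global learning_rate, and the overrides only matter for parameters that are actually trainable (hence the note about `--freeze_vit false` / `--freeze_aligner false`). A minimal sketch of that fallback logic; the function name and dict layout are illustrative, not ms-swift API:

from typing import Optional

def resolve_component_lrs(learning_rate: float,
                          vit_lr: Optional[float] = None,
                          aligner_lr: Optional[float] = None) -> dict:
    """Effective per-component learning rates; None falls back to learning_rate."""
    return {
        'llm': learning_rate,
        'vit': vit_lr if vit_lr is not None else learning_rate,
        'aligner': aligner_lr if aligner_lr is not None else learning_rate,
    }

# e.g. train the ViT more conservatively than the LLM and aligner:
print(resolve_component_lrs(1e-5, vit_lr=1e-6))
# {'llm': 1e-05, 'vit': 1e-06, 'aligner': 1e-05}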

swift/megatron/argument/megatron_args.py

Lines changed: 2 additions & 0 deletions
@@ -127,6 +127,8 @@ class ExtraMegatronArguments(RLHFMegatronArgumentsMixin, MegatronTunerMixin):
 
     # visual
     vit_gradient_checkpointing: bool = True
+    vit_lr: Optional[float] = None
+    aligner_lr: Optional[float] = None
     gradient_checkpointing_kwargs: Optional[Union[dict, str]] = None
     # qwen3_next
     linear_num_value_heads: Optional[int] = None
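
These two optional fields feed the trainer change below: when vit_lr or aligner_lr is set, the matching parameter groups receive a learning-rate multiplier relative to the base lr (the diff computes `_lr_mult = self.args.vit_lr / lr`), so a group's effective rate works out to lr * lr_mult. A small sketch of that arithmetic; the helper name is illustrative, not ms-swift API:

from typing import Optional

def component_lr_mult(base_lr: float, component_lr: Optional[float]) -> float:
    """Multiplier so that base_lr * mult matches the component override."""
    if component_lr is None:        # unset -> group keeps the base learning rate
        return 1.0
    return component_lr / base_lr

# learning_rate=1e-5, vit_lr=1e-6, aligner_lr left unset:
print(component_lr_mult(1e-5, 1e-6))   # ~0.1 -> ViT groups train 10x slower
print(component_lr_mult(1e-5, None))   # 1.0  -> aligner follows learning_rate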

swift/megatron/trainers/base.py

Lines changed: 155 additions & 2 deletions
@@ -6,16 +6,18 @@
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from datetime import datetime
-from typing import Dict, Literal
+from typing import Callable, Dict, List, Literal, Optional
 
 import megatron.core
 import torch
 import torch.nn
 from megatron.core import mpu
 from megatron.core.enums import ModelType
 from megatron.core.num_microbatches_calculator import get_num_microbatches
+from megatron.core.optimizer import _update_min_and_max_lr_in_param_groups, param_group_identifier_keys
 from megatron.core.pipeline_parallel import get_forward_backward_func
 from megatron.core.rerun_state_machine import RerunMode, get_rerun_state_machine
+from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.moe.moe_utils import track_moe_metrics
 from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper
 from megatron.core.utils import StragglerDetector
@@ -235,6 +237,157 @@ def load_state_dict(self, state_dict, strict: bool = True, *args, **kwargs):
             args.no_load_rng = origin_no_load_rng
             args.finetune = origin_finetune
 
+    # Code borrowed from Megatron-LM
+    def _get_param_groups(
+        self,
+        model_chunks: List[MegatronModule],
+        no_weight_decay_cond: Optional[Callable],
+        scale_lr_cond: Optional[Callable],
+        lr_mult: float,
+        lr: float,
+        min_lr: float,
+        decoupled_lr: Optional[float],
+        decoupled_min_lr: Optional[float],
+        default_skip_embedding_weight_decay: bool = False,
+    ) -> List[Dict]:
+        """Create parameter groups for optimizer.
+
+        Creates parameter groups based on weight decay condition (regularized vs
+        non regularized), learning rate scale condition (lr vs lr_mult * lr),
+        and whether it is expert parameters. scale_lr_cond is used during finetuning
+        where head of the network requires a scaled version of the base learning rate.
+
+        Args:
+            model_chunks (List[MegatronModule]): model chunks to create parameter
+                groups for.
+            no_weight_decay_cond (func, optional): function to determine whether a
+                parameter should not perform weight decay.
+            scale_lr_cond (func, optional): function to determine whether a parameter
+                should have a scaled learning rate.
+            lr_mult (float): learning rate multiplier for parameters that
+                satisfy scale_lr_cond.
+            lr (float): learning rate.
+            min_lr (float): minimum learning rate.
+            decoupled_lr (Optional[float]): optional decoupled learning rate.
+            decoupled_min_lr (Optional[float]): optional decoupled minimum learning rate.
+            default_skip_embedding_weight_decay (bool): whether to skip weight decay for embedding
+                parameters by default, if no_weight_decay_cond is not provided.
+
+        Returns:
+            List of parameter groups.
+        """
+
+        use_decoupled_learning_rate = decoupled_lr is not None
+
+        # Map (wd_mult, lr_mult, is_expert_parallel, is_decoupled_lr) to params.
+        params_map = {}
+        for model_chunk in model_chunks:
+            visual = model_chunk.module.module.visual
+            for name, param in model_chunk.named_parameters():
+                if not param.requires_grad:
+                    continue
+
+                is_expert_parallel = not getattr(param, 'allreduce', True)
+
+                if no_weight_decay_cond is not None:
+                    no_wd: bool = no_weight_decay_cond(name, param)
+                else:
+                    # Do not regularize biases and norm parameters.
+                    # optionally, also skip weight decay for embedding parameters if requested
+                    # (useful if you do not want embeddings to shrink to zero in training
+                    # https://arxiv.org/abs/2312.16903)
+                    no_wd = (
+                        name.endswith('.bias') or len(param.shape) == 1
+                        or (default_skip_embedding_weight_decay and 'embedding' in name))
+                _lr_mult = lr_mult
+                if scale_lr_cond is not None:
+                    scale_lr = scale_lr_cond(name, param)
+                else:
+                    scale_lr = False
+                # Handling multimodal models: vit_lr, aligner_lr
+                unwrapped_name = name.removeprefix('module.').removeprefix('module.')
+                is_aligner = any(unwrapped_name.startswith(f'visual.{k}') for k in visual._aligner)
+                is_vit = any(unwrapped_name.startswith(f'visual.{k}')
+                             for k in visual._vision_tower) and not is_aligner
+                if is_vit and self.args.vit_lr:
+                    scale_lr = True
+                    _lr_mult = self.args.vit_lr / lr
+                elif is_aligner and self.args.aligner_lr:
+                    scale_lr = True
+                    _lr_mult = self.args.aligner_lr / lr
+
+                if not no_wd and not scale_lr:
+                    wd_mult, _lr_mult = 1.0, 1.0
+                elif not no_wd and scale_lr:
+                    wd_mult, _lr_mult = 1.0, _lr_mult
+                elif no_wd and not scale_lr:
+                    wd_mult, _lr_mult = 0.0, 1.0
+                else:
+                    wd_mult, _lr_mult = 0.0, _lr_mult
+
+                is_decoupled_lr = False
+                # For input/embedding and output layer: embedding.word_embeddings.weight /
+                # output_layer.weight.
+                if use_decoupled_learning_rate and getattr(param, 'is_embedding_or_output_parameter', False):
+                    is_decoupled_lr = True
+
+                key = (wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr)
+                if key not in params_map:
+                    params_map[key] = []
+                params_map[key].append(param)
+
+        # Distributed checkpoint requires all ranks to have the same param groups,
+        # so we need to align the param groups across ranks, otherwise we may have
+        # runtime error when loading the checkpoint or numerical error when resuming training.
+        params_key = list(params_map.keys())
+        gathered_params_key = [None for _ in range(torch.distributed.get_world_size())]
+        torch.distributed.all_gather_object(gathered_params_key, params_key)
+        for keys in gathered_params_key:
+            for key in keys:
+                if key not in params_key:
+                    params_key.append(key)
+
+        param_groups = []
+        for key in params_key:
+            wd_mult, _lr_mult, is_expert_parallel, is_decoupled_lr = key
+            params = params_map[key] if key in params_map else []
+            param_group = {
+                'params': params,
+                'wd_mult': wd_mult,
+                'lr_mult': _lr_mult,
+                'is_expert_parallel': is_expert_parallel,
+                'is_decoupled_lr': is_decoupled_lr,
+            }
+            # Ensure param_group has required keys for matching when loading optimizer state
+            # See MegatronOptimizer._filter_and_reorder_param_groups.
+            assert set(param_group.keys()) - set(param_group_identifier_keys) == {'params'}
+            param_groups.append(param_group)
+
+        param_groups = _update_min_and_max_lr_in_param_groups(
+            param_groups,
+            lr=lr,
+            min_lr=min_lr,
+            decoupled_lr=decoupled_lr,
+            decoupled_min_lr=decoupled_min_lr,
+        )
+
+        return param_groups
+
+    @contextmanager
+    def _patch_get_param_groups(self):
+        if not self.args.megatron_model_meta.is_multimodal or (self.args.vit_lr is None
+                                                               and self.args.aligner_lr is None):
+            yield
+            return
+        from megatron.core import optimizer
+
+        _get_param_groups = optimizer._get_param_groups
+        optimizer._get_param_groups = self._get_param_groups
+        try:
+            yield
+        finally:
+            optimizer._get_param_groups = _get_param_groups
+
     def setup_model_and_optimizer(self, model_provider_func, model_type, *_args, **kwargs):
 
         args = get_args()
@@ -254,7 +407,7 @@ def new_model_provider_func(*_args, **kwargs):
            return model
 
        self._init_multimodal_full()
-       with self._patch_load_state_dict(self._load_base_checkpoint):
+       with self._patch_load_state_dict(self._load_base_checkpoint), self._patch_get_param_groups():
            model, optimizer, opt_param_scheduler = self._origin_setup_model_and_optimizer(
                new_model_provider_func, model_type, *_args, **kwargs)
        if args.initialize_embedding:
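
The `_patch_get_param_groups` context manager above works by temporarily rebinding the module-level `megatron.core.optimizer._get_param_groups` to the vit/aligner-aware method while `setup_model_and_optimizer` runs, and restoring the original in the `finally` block. A standalone sketch of that patch-and-restore pattern, using a toy module and toy functions (nothing below is Megatron-LM or ms-swift API):

import types
from contextlib import contextmanager

# Toy stand-in for megatron.core.optimizer: a module whose internal helper
# framework code calls when building the optimizer.
toy_optimizer = types.ModuleType('toy_optimizer')
toy_optimizer._get_param_groups = lambda params: [{'params': params, 'lr_mult': 1.0}]

def custom_get_param_groups(params):
    # Pretend this is the vit/aligner-aware grouping.
    return [{'params': params, 'lr_mult': 0.1}]

@contextmanager
def patch_get_param_groups(module, replacement):
    original = module._get_param_groups
    module._get_param_groups = replacement      # route calls through the custom grouping
    try:
        yield
    finally:
        module._get_param_groups = original     # always restore, even on error

def build_optimizer(params):
    # Framework-style code looks the helper up on the module at call time,
    # which is why rebinding the module attribute is enough.
    return toy_optimizer._get_param_groups(params)

with patch_get_param_groups(toy_optimizer, custom_get_param_groups):
    print(build_optimizer(['w1', 'w2']))   # lr_mult 0.1 -> patched path
print(build_optimizer(['w1', 'w2']))       # lr_mult 1.0 -> original restored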
