Commit 3a2ce6f

Implement gradient clipping (#286)
* Implement gradient clipping
* Fix tests
* Support both clipping by norm and by value
* Update base_trainer.py
* Update test_trainer.py
* Add gradient clipping tests
1 parent 1d36d80 commit 3a2ce6f

3 files changed: +167 −9 lines
Lines changed: 11 additions & 1 deletion

@@ -1,12 +1,22 @@
-# this is for basic training loop, i.e., forward/backward pass without any special task logic.
+# This is for basic training loop, i.e., forward/backward pass without any special task logic.
 name: train
 global_batch_size: 4
 max_steps: 15

+# Optimizer configuration.
 optimizer:
   learning_rate: 5.e-5
   type: adafactor

+# Defaults to clip the L2 norm of gradients to 1.0.
+# Set to null to disable gradient clipping by norm.
+max_grad_norm: 1.0
+
+# Defaults to not clip gradients by their absolute value.
+# Set to a number to clip gradients by the specified max absolute value.
+max_grad_value: null
+
+# Learning rate scheduler configuration.
 lr_scheduler:
   type: linear
   warmup_steps: 0
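For orientation, the two new knobs map onto the standard PyTorch clipping utilities used later in this commit: max_grad_norm rescales the whole gradient so its global L2 norm does not exceed the limit, while max_grad_value clamps each gradient element into [-value, value]. A minimal standalone sketch of that behavior with plain PyTorch (the toy parameter and the 0.3 value limit are made up for illustration; only the 1.0 norm limit mirrors the default above):

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils

# A toy parameter whose gradient is all ones, so the gradient's L2 norm is sqrt(4) = 2.0.
param = nn.Parameter(torch.ones(4))
param.grad = torch.ones(4)

# max_grad_norm: 1.0 -> the whole gradient is rescaled by 1.0 / 2.0 = 0.5.
total_norm = nn_utils.clip_grad_norm_([param], max_norm=1.0)
print(total_norm.item())  # 2.0, the norm *before* clipping
print(param.grad)         # tensor([0.5000, 0.5000, 0.5000, 0.5000])

# max_grad_value: 0.3 -> each element is clamped into [-0.3, 0.3] independently.
nn_utils.clip_grad_value_([param], clip_value=0.3)
print(param.grad)         # tensor([0.3000, 0.3000, 0.3000, 0.3000])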

torchprime/torch_xla_models/tests/test_trainer.py

Lines changed: 127 additions & 1 deletion
@@ -83,6 +83,8 @@ def dummy_config():
         "global_batch_size": 4,
         "max_steps": 2,
         "optimizer": {"type": "adafactor", "learning_rate": 1e-3},
+        "max_grad_norm": None,
+        "max_grad_value": None,
         "lr_scheduler": {"type": "constant", "warmup_steps": 0},
       },
       "run_name": None,
@@ -170,7 +172,131 @@ def test_trainer_train_step(monkeypatch, dummy_config):
   trainer = Trainer(model, dummy_config, dataset)

   batch = {k: v.unsqueeze(0).to(device) for k, v in dataset[0].items()}
-  loss = trainer.train_step(batch)
+  loss, grad_norm = trainer.train_step(batch)

   assert isinstance(loss, torch.Tensor)
   assert loss.ndim == 0  # scalar loss
+  assert isinstance(grad_norm, torch.Tensor)
+  assert grad_norm.ndim == 0  # scalar gradient norm
+
+
+def test_trainer_clip_gradients_by_norm(monkeypatch, dummy_config):
+  """Test correctness of gradient clipping by norm in a train step."""
+  import torch_xla
+
+  from torchprime.torch_xla_models.model_rewriting import sharding_initialization
+
+  # Arrange
+  monkeypatch.setattr(
+    sharding_initialization, "get_mesh", lambda *args, **kwargs: FakeMesh()
+  )
+  monkeypatch.setattr(
+    sharding_initialization,
+    "shard_torch_xla_model_from_config",
+    lambda model, *args, **kwargs: model,
+  )
+
+  class SumModel(nn.Module):
+    def __init__(self):
+      super().__init__()
+      self.linear = nn.Linear(4, 1, bias=False)
+
+    def forward(self, input_ids=None, attention_mask=None, **kwargs):
+      logits = self.linear(input_ids)
+      loss = logits.mean()
+      return logits, loss
+
+  dummy_config.task.max_grad_norm = 1.0
+  dummy_config.task.max_grad_value = None
+  model = SumModel().to("xla")
+  with torch.no_grad():
+    model.linear.weight.fill_(1.0)
+  dataset = DummyDataset()
+  trainer = Trainer(model, dummy_config, dataset)
+  torch_xla.sync()
+
+  # Act
+  batch = {k: v.unsqueeze(0).to("xla") for k, v in dataset[0].items()}
+  loss, grad_norm = trainer.train_step(batch)
+
+  # Assert
+  # Loss should be exactly 4.0 since we are summing 4 inputs of 1.0.
+  assert loss.item() == 4.0
+
+  # ∂L/∂W = 1.0 for each weight in the linear layer
+  # Expected gradient norm before clipping: sqrt(4 * 1^2) = 2.0
+  assert pytest.approx(grad_norm.item(), rel=1e-5) == 2.0
+
+  # Verify the actual gradients on the model
+  # The original gradient for each weight would be 1.0
+  # With clipping factor 0.5 (1.0/2.0), each gradient becomes 0.5
+  if hasattr(model.linear.weight, "grad") and model.linear.weight.grad is not None:
+    expected_clipped_grad = torch.full_like(model.linear.weight, 0.5)
+    torch.testing.assert_close(
+      model.linear.weight.grad, expected_clipped_grad, rtol=1e-5, atol=1e-5
+    )
+
+    # Also verify the gradient norm matches what we expect
+    actual_grad_norm = torch.norm(model.linear.weight.grad)
+    assert pytest.approx(actual_grad_norm.item(), rel=1e-5) == 1.0
+
+
+def test_trainer_clip_gradients_by_value(monkeypatch, dummy_config):
+  """Test correctness of gradient clipping by max absolute value in a train step."""
+  import torch_xla
+
+  from torchprime.torch_xla_models.model_rewriting import sharding_initialization
+
+  # Arrange
+  monkeypatch.setattr(
+    sharding_initialization, "get_mesh", lambda *args, **kwargs: FakeMesh()
+  )
+  monkeypatch.setattr(
+    sharding_initialization,
+    "shard_torch_xla_model_from_config",
+    lambda model, *args, **kwargs: model,
+  )
+
+  class SumModel(nn.Module):
+    def __init__(self):
+      super().__init__()
+      self.linear = nn.Linear(4, 1, bias=False)
+
+    def forward(self, input_ids=None, attention_mask=None, **kwargs):
+      logits = self.linear(input_ids)
+      loss = logits.mean()
+      return logits, loss
+
+  dummy_config.task.max_grad_value = 0.5
+  dummy_config.task.max_grad_norm = None
+  model = SumModel().to("xla")
+  with torch.no_grad():
+    model.linear.weight.fill_(1.0)
+  dataset = DummyDataset()
+  trainer = Trainer(model, dummy_config, dataset)
+  torch_xla.sync()
+
+  # Act
+  batch = {k: v.unsqueeze(0).to("xla") for k, v in dataset[0].items()}
+  loss, grad_norm = trainer.train_step(batch)
+
+  # Assert
+  # Loss should be exactly 4.0 since we are summing 4 inputs of 1.0.
+  assert loss.item() == 4.0
+
+  # ∂L/∂W = 1.0 for each weight in the linear layer
+  # Expected gradient norm before clipping: sqrt(4 * 1^2) = 2.0
+  assert pytest.approx(grad_norm.item(), rel=1e-5) == 2.0
+
+  # Verify the actual gradients on the model
+  # The original gradient for each weight would be 1.0
+  # With value clipping at 0.5, each gradient becomes 0.5
+  if hasattr(model.linear.weight, "grad") and model.linear.weight.grad is not None:
+    expected_clipped_grad = torch.full_like(model.linear.weight, 0.5)
+    torch.testing.assert_close(
+      model.linear.weight.grad, expected_clipped_grad, rtol=1e-5, atol=1e-5
+    )
+
+    # Verify all gradient values are within [-max_grad_value, max_grad_value]
+    assert torch.all(model.linear.weight.grad <= dummy_config.task.max_grad_value)
+    assert torch.all(model.linear.weight.grad >= -dummy_config.task.max_grad_value)
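The expected values in these tests follow from a short hand calculation: with unit weights and (as the test comments imply) a length-4 all-ones input, the single logit is 4.0, so the mean loss is 4.0; each of the four weight gradients is 1.0, giving a pre-clipping L2 norm of sqrt(4) = 2.0; clipping that norm to 1.0 scales every gradient by 0.5, and value clipping at 0.5 lands on the same numbers. A minimal CPU-only sketch that checks the same arithmetic without the XLA trainer (the all-ones input is an assumed stand-in for the dummy batch):

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils

# Same shape as SumModel in the tests: a 4 -> 1 linear layer with unit weights.
linear = nn.Linear(4, 1, bias=False)
with torch.no_grad():
  linear.weight.fill_(1.0)

x = torch.ones(1, 4)       # assumed stand-in for the dummy batch
loss = linear(x).mean()
loss.backward()

assert loss.item() == 4.0  # 1*1 + 1*1 + 1*1 + 1*1
assert torch.equal(linear.weight.grad, torch.ones(1, 4))

# Clipping to max norm 1.0: the pre-clip norm is 2.0, so every gradient becomes ~0.5.
grad_norm = nn_utils.clip_grad_norm_(linear.parameters(), max_norm=1.0)
assert abs(grad_norm.item() - 2.0) < 1e-6
assert torch.allclose(linear.weight.grad, torch.full((1, 4), 0.5))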

torchprime/torch_xla_models/trainer/base_trainer.py

Lines changed: 29 additions & 7 deletions
@@ -18,6 +18,7 @@
 from timeit import default_timer as timer

 import torch
+import torch.nn.utils as nn_utils
 import torch_xla
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.profiler as xp
@@ -182,12 +183,13 @@ def _get_train_dataloader(self) -> pl.MpDeviceLoader:
     return loader

   def _log_to_tensorboard(
-    self, epoch: float, step: int, loss: float, learning_rate: float
+    self, epoch: float, step: int, loss: float, learning_rate: float, grad_norm: float
   ):
     """Log metrics to TensorBoard."""
     self.summary_writer.add_scalar("train/epoch", epoch, step)
     self.summary_writer.add_scalar("train/loss", loss, step)
     self.summary_writer.add_scalar("train/learning_rate", learning_rate, step)
+    self.summary_writer.add_scalar("train/grad_norm", grad_norm, step)
     self.summary_writer.flush()

   def train_loop(self, metrics_logger) -> None:
@@ -214,22 +216,26 @@ def train_loop(self, metrics_logger) -> None:
       batch = next(train_iterator)

       trace_start_time = timer()
-      loss = self.train_step(batch)
+      loss, grad_norm = self.train_step(batch)
       trace_end_time = timer()

       if step % self.config.logging_steps == 0:

-        def step_closure(epoch, step, loss, trace_start_time, trace_end_time, lr):
+        def step_closure(
+          epoch, step, loss, grad_norm, trace_start_time, trace_end_time, lr
+        ):
           loss = loss.detach().item()
+          grad_norm = grad_norm.detach().item()
           logger.info(
-            "Epoch: %d, step: %d, loss: %.4f, lr: %.2e, trace time: %.2f ms",
+            "Epoch: %d, step: %d, loss: %.4f, grad_norm: %.4f, lr: %.2e, trace time: %.2f ms",
             epoch,
             step,
             loss,
+            grad_norm,
             lr,
             (trace_end_time - trace_start_time) * 1000,
           )
-          self._log_to_tensorboard(epoch, step, loss, lr)
+          self._log_to_tensorboard(epoch, step, loss, lr, grad_norm)
           if math.isnan(loss):
             raise ValueError(f"Loss is NaN at step {step}")

@@ -239,6 +245,7 @@ def step_closure(epoch, step, loss, trace_start_time, trace_end_time, lr):
            epoch,
            step,
            loss,
+           grad_norm,
            trace_start_time,
            trace_end_time,
            self.lr_scheduler.get_last_lr()[0],
@@ -301,10 +308,25 @@ def step_closure(epoch, step, loss, trace_start_time, trace_end_time, lr):
    OmegaConf.save(config=self.config, f=config_save_path)

  @torch_xla.compile(full_graph=True)
-  def train_step(self, batch: dict) -> torch.Tensor:
+  def train_step(self, batch: dict) -> tuple[torch.Tensor, torch.Tensor]:
    _logits, loss = self.model(**batch)
    loss.backward()
+    grad_norm = self.clip_gradients()
    self.optimizer.step()
    self.lr_scheduler.step()
    self.model.zero_grad()
-    return loss
+    return loss, grad_norm
+
+  def clip_gradients(self):
+    """Clip gradients by the specified max norm and/or max absolute value."""
+    max_grad_norm = self.config.task.max_grad_norm
+    if max_grad_norm is None or max_grad_norm <= 0:
+      grad_norm = nn_utils.get_total_norm(self.model.parameters(), norm_type=2)
+    else:
+      grad_norm = nn_utils.clip_grad_norm_(
+        self.model.parameters(), max_norm=max_grad_norm, norm_type=2
+      )
+    max_grad_value = self.config.task.max_grad_value
+    if max_grad_value is not None and max_grad_value > 0:
+      nn_utils.clip_grad_value_(self.model.parameters(), clip_value=max_grad_value)
+    return grad_norm
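The ordering is the important part: gradients are clipped after loss.backward() and before optimizer.step(), and the returned grad_norm (the pre-clipping norm reported by clip_grad_norm_, or a norm computed with nn_utils.get_total_norm when norm clipping is disabled) is what gets logged. A minimal CPU-only sketch of the same step structure, not the trainer itself: max_grad_norm and max_grad_value are plain variables standing in for the config values, the optimizer is plain SGD, and the fallback norm here is computed over the gradient tensors; nn_utils.get_total_norm needs a recent PyTorch release (it is the helper the commit imports via torch.nn.utils).

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils

# Hypothetical stand-ins for the values read from the task config.
max_grad_norm = 1.0    # clip the global L2 norm of gradients to this value
max_grad_value = None  # optionally clamp each gradient element to [-v, v]

model = nn.Linear(4, 1, bias=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

def train_step(batch: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
  # Mirror the order in the trainer: backward, clip, step, zero.
  loss = model(batch).mean()
  loss.backward()
  if max_grad_norm is not None and max_grad_norm > 0:
    grad_norm = nn_utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
  else:
    # Without norm clipping, just measure the gradient norm for logging.
    grads = [p.grad for p in model.parameters() if p.grad is not None]
    grad_norm = nn_utils.get_total_norm(grads, norm_type=2)
  if max_grad_value is not None and max_grad_value > 0:
    nn_utils.clip_grad_value_(model.parameters(), clip_value=max_grad_value)
  optimizer.step()
  model.zero_grad()
  return loss, grad_norm

loss, grad_norm = train_step(torch.ones(1, 4))
print(f"loss={loss.item():.4f}, grad_norm={grad_norm.item():.4f}")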
