NVIDIA · ashors1 · Jan 23, 2025 · Dec 14, 2024 · Dec 16, 2024 · Dec 16, 2024
diff --git a/nemo/collections/common/metrics/perf_metrics.py b/nemo/collections/common/metrics/perf_metrics.py
@@ -17,8 +17,8 @@
 import numpy as np
 from lightning.pytorch.callbacks import Callback
 
-from nemo.collections.common.parts.perf_metrics_utils import LLM_VOCAB_SIZE_MAP, read_tb_log
-from nemo.utils import logging
+from nemo.collections.common.parts.perf_metrics_utils import read_tb_log
+from nemo.utils import flops_formulas, logging
 
 __all__ = ["FLOPsMeasurementCallback"]
 
@@ -128,12 +128,12 @@ def eval_model_flops(self):
         """
 
         model_flops_map = {
-            "gpt3": self._gpt3,
-            "llama2": self._llama2,
-            "llama3": self._llama3,
-            "nemotron": self._nemotron,
-            "mixtral": self._mixtral,
-            "bert": self._bert,
+            "gpt3": flops_formulas.gpt3,
+            "llama2": flops_formulas.llama2,
+            "llama3": flops_formulas.llama3,
+            "nemotron": flops_formulas.nemotron,
+            "mixtral": flops_formulas.mixtral,
+            "bert": flops_formulas.bert,
         }
 
         if self.model is not None:
@@ -143,107 +143,7 @@ def eval_model_flops(self):
             logging.info(f"FLOPs measurement supported for {list(model_flops_map.keys())}")
             raise KeyError(f"Failed to extract valid model name from or missing FLOPs calculations for {self.model}")
 
-        total_flops = model_flops_map[self.model]()
+        total_flops = model_flops_map[self.model](self)
         flops_per_gpu = total_flops / (self.num_nodes * self.num_gpus_per_node)
 
         return total_flops, flops_per_gpu
-
-    def _gpt3(self):
-        """Model FLOPs for GPT3 family"""
-
-        vocab_size = LLM_VOCAB_SIZE_MAP["gpt3"]
-
-        return (
-            24 * self.gbs * self.enc_seq_len * self.hs * self.hs
-            + 4 * self.gbs * self.enc_seq_len * self.enc_seq_len * self.hs
-        ) * (3 * self.layers) + (6 * self.gbs * self.enc_seq_len * self.hs * vocab_size)
-
-    def _llama2(self):
-        """Model FLOPs for llama2 family"""
-        vocab_size = LLM_VOCAB_SIZE_MAP["llama2"]
-
-        return (
-            self.gbs
-            * self.enc_seq_len
-            * self.layers
-            * self.hs
-            * self.hs
-            * (
-                12
-                + (12 * self.query_groups / self.attention_heads)
-                + (18 * self.ffn_hs / self.hs)
-                + (12 * self.enc_seq_len / self.hs)
-                + (6 * vocab_size / (self.layers * self.hs))
-            )
-        )
-
-    def _llama3(self):
-        """Model FLOPs for llama3 family"""
-        vocab_size = LLM_VOCAB_SIZE_MAP["llama3"]
-
-        return (
-            self.gbs
-            * self.enc_seq_len
-            * self.layers
-            * self.hs
-            * self.hs
-            * (
-                12
-                + (12 * self.query_groups / self.attention_heads)
-                + (18 * self.ffn_hs / self.hs)
-                + (12 * self.enc_seq_len / self.hs)
-                + (6 * vocab_size / (self.layers * self.hs))
-            )
-        )
-
-    def _nemotron(self):
-        """Model FLOPs for nemotron family"""
-        vocab_size = LLM_VOCAB_SIZE_MAP["nemotron"]
-
-        return (
-            self.gbs
-            * self.enc_seq_len
-            * self.layers
-            * self.hs
-            * self.hs
-            * (
-                12
-                + (12 * self.query_groups / self.attention_heads)
-                + (12 * self.ffn_hs / self.hs)
-                + (12 * self.enc_seq_len / self.hs)
-                + (6 * vocab_size / (self.layers * self.hs))
-            )
-        )
-
-    def _mixtral(self):
-        """Model FLOPs for mixtral family"""
-        vocab_size = LLM_VOCAB_SIZE_MAP["mixtral"]
-
-        return (
-            self.gbs
-            * self.enc_seq_len
-            * self.layers
-            * self.hs
-            * self.hs
-            * (
-                12
-                + (12 * self.query_groups / self.attention_heads)
-                + (18 * self.moe_router_topk * self.ffn_hs / self.hs)
-                + (12 * self.enc_seq_len / self.hs)
-                + (6 * vocab_size / (self.layers * self.hs))
-            )
-        )
-
-    def _bert(self):
-        """Model FLOPs for BERT family"""
-        vocab_size = LLM_VOCAB_SIZE_MAP["bert"]
-
-        return (
-            72
-            * self.gbs
-            * self.layers
-            * self.enc_seq_len
-            * self.hs
-            * self.hs
-            * (1 + (self.enc_seq_len / (6 * self.hs)) + (vocab_size / (12 * self.hs * self.layers)))
-        )
diff --git a/nemo/lightning/pytorch/callbacks/flops_callback.py b/nemo/lightning/pytorch/callbacks/flops_callback.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List, Optional
+
+import lightning.pytorch as pl
+import numpy as np
+import torch
+from lightning.pytorch.callbacks import Callback
+
+from nemo.lightning.pytorch.callbacks import PEFT
+from nemo.utils import flops_formulas, logging
+
+__all__ = ["FLOPsMeasurementCallback"]
+
+
+class FLOPsMeasurementCallback(Callback):
+    """
+    Calculate FLOPs per second after last train step for a given job run.
+
+    Args:
+        model_config (Dict[str, Any]): params for running the experiment/job.
+        Expects a nested dictionary with parent keys
+            1. run- for assessing model name (Eg. 'gpt3', 'llama2', etc.) from sub-key 'name'.
+                'name' usually has value like- train_gpt3_5b_*, which is matched to model name 'gpt3'.
+            2. exp_manager- for accessing 'explicit_log_dir'. tensorboard log file is stored here,
+                used for accessing step time needed for calculating TFLOPs per sec per GPU
+            3. trainer- for accessing 'num_nodes' and 'devices' needed for calculating
+                TFLOPs per sec per GPU
+            4. model- Hyperparams for the model. Specifically- global batch size, sequence length,
+                hidden size,  ffn hidden size, num_layers, num_attention_heads, num_query_groups,
+                moe_router_topk. (list might increase with new models as required)
+        log_dir (Optional[str]): Directory with tensorboard log file. If present, will override
+            'explicit_log_dir' in model_config. Defaults to None.
+        model_name (Optional[str]): If present, will override 'name' under 'run' in model_config.
+            Defaults to None.
+    """
+
+    higher_is_better = True
+
+    def __init__(
+        self,
+        model_config: Dict[str, Any],
+        data_config: pl.LightningDataModule,
+        model_name: Optional[str],
+    ):
+        self.model_cfg = model_config
+        self.data_cfg = data_config
+
+        # use config params only when NOT provided explicitly
+        self.model = model_name
+
+        self.gbs = self.data_cfg.global_batch_size
+        self.enc_seq_len = self.model_cfg.seq_length
+        self.hs = self.model_cfg.hidden_size
+        self.layers = self.model_cfg.num_layers
+        self.ffn_hs = self.model_cfg.ffn_hidden_size
+        self.attention_heads = self.model_cfg.num_attention_heads
+        self.moe_router_topk = self.model_cfg.moe_router_topk
+
+        # when num_query_groups is None, fall back to the number of attention heads
+        self.query_groups = self.model_cfg.num_query_groups
+        if self.query_groups is None:
+            self.query_groups = self.attention_heads
+
+        self.model = self.model.lower() if self.model is not None else self.model
+
+        self.avg_train_step_time = 0
+
+    def on_train_start(self, trainer, pl_module):
+        for callback in trainer.callbacks:
+            if isinstance(callback, PEFT):
+                raise NotImplementedError("FLOPs measurement not supported for finetuning jobs")
+
+    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int):
+        """
+        PyTorch Lightning callback hook to log TFLOPs per sec per GPU every `trainer.log_every_n_steps` steps
+        """
+        try:
+            self.avg_train_step_time += trainer.progress_bar_metrics['train_step_timing in s']
+        except KeyError:
+            print("'train_step_timing in s' not found. Make sure to use TimingCallback with FLOPsMeasurementCallback.")
+
+        n = trainer.strategy.current_epoch_step
+        if n % trainer.log_every_n_steps == 0:
+            ## skip calculation if we haven't accumulated any timing data
+            if self.avg_train_step_time == 0:
+                return
+            tflops_per_sec_per_gpu = self.eval_tflops_per_sec_per_gpu(
+                self.avg_train_step_time / trainer.log_every_n_steps
+            )
+            self.avg_train_step_time = 0
+            pl_module.log(
+                "tflops_per_sec_per_gpu",
+                tflops_per_sec_per_gpu,
+                on_step=True,
+                on_epoch=False,
+                batch_size=1,
+                prog_bar=True,
+            )
+
+    def eval_tflops_per_sec_per_gpu(self, train_step_time: List | float | int) -> float:
+        """
+        Args:
+            train_step_time (Union[List, float, int]): Train step time (in seconds).
+                Step time is less stable for the initial steps (~10 steps), which makes
+                the measurement less accurate. Use the average step time over several
+                steps for higher accuracy.
+        Returns:
+            (float): Model TFLOPs per sec per gpu
+        """
+        total_flops, flops_per_gpu = self.eval_model_flops()
+
+        if not isinstance(train_step_time, list):
+            train_step_time = [train_step_time]
+        # efficient mean computation if num train steps is very large
+        step_time_arr = np.array(train_step_time)
+        train_step_time = np.mean(step_time_arr[len(step_time_arr) // 2 :])
+
+        return flops_per_gpu / (1e12 * train_step_time)
+
+    def eval_model_flops(self):
+        """
+        Calculate model FLOPs for a given model
+        """
+
+        model_flops_map = {
+            "gpt3": flops_formulas.gpt3,
+            "llama2": flops_formulas.llama2,
+            "llama3": flops_formulas.llama3,
+            "nemotron": flops_formulas.nemotron,
+            "mixtral": flops_formulas.mixtral,
+            "bert": flops_formulas.bert,
+        }
+
+        if self.model is not None:
+            model_matches = [model for model in model_flops_map if model in self.model]
+            self.model = model_matches[0] if len(model_matches) > 0 else self.model
+        if self.model not in model_flops_map:
+            logging.info(f"FLOPs measurement supported for {list(model_flops_map.keys())}")
+            raise KeyError(f"Failed to extract valid model name from or missing FLOPs calculations for {self.model}")
+
+        total_flops = model_flops_map[self.model](self)
+        num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
+        flops_per_gpu = total_flops / num_devices
+
+        return total_flops, flops_per_gpu
diff --git a/nemo/lightning/pytorch/callbacks/progress_printer.py b/nemo/lightning/pytorch/callbacks/progress_printer.py
@@ -122,9 +122,15 @@ def on_train_start(self, trainer, *_):
     ## TODO(ashors): handle nan losses
     @override
     def on_train_batch_end(self, trainer, pl_module, *_, **__):
+        n = trainer.strategy.current_epoch_step
+
+        if self.should_log(n) and getattr(trainer.strategy, "timers", None):
+            timers = trainer.strategy._mcore_config.timers  # pointer to timers used in megatron
+            megatron_log_string = self.log_megatron_timers(timers)
+
         if self.is_disabled:
             return
-        n = trainer.strategy.current_epoch_step
+
         metrics = self.get_metrics(trainer, pl_module)
         for key in metrics:
             if key in self.exclude_metrics:
@@ -138,6 +144,8 @@ def on_train_batch_end(self, trainer, pl_module, *_, **__):
             prefix = self.train_description + f" epoch {trainer.current_epoch}, iteration {n-1}/{self.total-1}"
             log_string = self.format_string(prefix, self.average_metrics_dict)
             print(log_string)
+            if megatron_log_string:
+                print(megatron_log_string, flush=True)
 
             self.total_metrics_dict = defaultdict(lambda: 0.0)
 
@@ -201,3 +209,9 @@ def on_test_batch_end(
 
     def should_log(self, n):
         return n % self.log_interval == 0
+
+    def log_megatron_timers(self, timers):
+        output_string = timers.get_all_timers_string(names=None, normalizer=self.log_interval)
+        if output_string is not None:
+            return output_string + "\n"
+        return None
diff --git a/nemo/lightning/pytorch/strategies/megatron_strategy.py b/nemo/lightning/pytorch/strategies/megatron_strategy.py
@@ -47,6 +47,7 @@
 from lightning.pytorch.strategies.ddp import DDPStrategy
 from lightning.pytorch.trainer.states import RunningStage, TrainerFn
 from lightning.pytorch.utilities.types import STEP_OUTPUT
+from megatron.core import Timers
 from megatron.core.distributed import DistributedDataParallelConfig
 from megatron.core.optimizer import OptimizerConfig
 from torch import nn
@@ -169,6 +170,12 @@ class MegatronStrategy(DDPStrategy, io.IOMixin):
             that prints the metrics to stdout. Suitable for non-interactive settings.
         progress_interval (int): How frequently to print progress to stdout. Only used when
             replace_progress_bar is True.
+        megatron_log_level (int): Granularity level to measure and report timing.
+            0: report only iteration time and make sure timing does not introduce extra overhead.
+            1: report timing for operations that are executed very limited times (basically once) during
+               each iteration (such as gradient all-reduce)
+            2: report timing for operations that might be executed numerous times during each iteration.
+            Note that setting the level to 1 or 2 might cause increase in iteration time.
         **kwargs: Additional keyword arguments.
 
     Note:
@@ -216,6 +223,7 @@ def __init__(
         replace_progress_bar: bool = True,
         progress_interval: int = 1,
         restore_config: Optional[RestoreConfig] = None,
+        megatron_log_level: int = 0,
         **kwargs,
     ) -> None:
         super().__init__(
@@ -267,6 +275,7 @@ def __init__(
         self.progress_interval = progress_interval
 
         self.restore_config = restore_config
+        self.timers = Timers(megatron_log_level, "minmax")  ## could also set this for optimizer if we want
 
         self._ddp = ddp
         if ddp == "megatron":
@@ -312,6 +321,9 @@ def connect(self, model: pl.LightningModule) -> None:
 
             model.config = update_config_with_dtype_overrides(dtype_config, model.config)
 
+        ## add megatron timer to config
+        model.config.timers = self.timers
+
         has_optim = getattr(model, "optim", None)
         if has_optim and self._setup_optimizers:
             opt_config = getattr(model.optim, "config", None)