Add functionality for aggregating scalability metrics across different runs

jarlsondre · jarlsondre · commit 608bf165c3af · 2025-03-19T14:35:36.000+01:00
diff --git a/src/itwinai/cli.py b/src/itwinai/cli.py
@@ -51,6 +51,7 @@ def generate_scalability_report(
             )
         ),
     ] = False,
+    run_id: str | None = None,
     backup_root_dir: Annotated[
         str, typer.Option(help=("Which directory to store the backup files in."))
     ] = "backup-scalability-metrics/",
@@ -96,45 +97,65 @@ def generate_scalability_report(
     plot_dir_path = Path(plot_dir)
     plot_dir_path.mkdir(exist_ok=True, parents=True)
 
-    report_dirs = {
-        "Epoch Time": {
-            "dir": log_dir_path / "epoch-time",
-            "func": epoch_time_report,
-        },
-        "GPU Data": {
-            "dir": log_dir_path / "gpu-energy-data",
-            "func": gpu_data_report,
-        },
-        "Communication Data": {
-            "dir": log_dir_path / "communication-data",
-            "func": communication_data_report,
-        },
-    }
+    # Finding all the appropriate paths
+    epoch_time_logdirs = []
+    gpu_data_logdirs = []
+    comm_time_logdirs = []
+    if run_id is None:
+        print("run_id was not passed, so will aggregate data from all runs in given directory!")
+        for path_elem in log_dir_path.iterdir():
+            print(f"Adding data from {path_elem}!")
+            if not path_elem.is_dir():
+                raise ValueError(
+                    f"Found element in logdir that was not itself a directory: "
+                    f"{path_elem.resolve()}"
+                )
+            epoch_time_logdirs.append(path_elem / "epoch-time")
+            gpu_data_logdirs.append(path_elem / "gpu-energy-data")
+            comm_time_logdirs.append(path_elem / "communication-data")
+        print()
+    else:
+        epoch_time_logdirs.append(log_dir_path / run_id / "epoch-time")
+        gpu_data_logdirs.append(log_dir_path / run_id / "gpu-energy-data")
+        comm_time_logdirs.append(log_dir_path / run_id / "communication-data")
 
+    # TODO: Include run_id in the backup directory path that is set up below
     # Setting the backup directory from exp name and run name
     experiment_name = experiment_name or f"exp_{uuid.uuid4().hex[:6]}"
     backup_dir = Path(backup_root_dir) / experiment_name
 
-    # Creating reports from dictionary
-    for report_name, details in report_dirs.items():
-        report_dir = details["dir"]
-        report_func = details["func"]
-
-        if report_dir.exists():
-            print("#" * 8, f"{report_name} Report", "#" * 8)
-            report_func(
-                report_dir,
-                plot_dir=plot_dir_path,
-                backup_dir=backup_dir,
-                do_backup=do_backup,
-                plot_file_suffix=plot_file_suffix,
-            )
-            print()
-        else:
-            print(
-                f"No report was created for {report_name} as '{report_dir.resolve()}' does "
-                f"not exist."
-            )
+    epoch_time_table = epoch_time_report(
+        log_dirs=epoch_time_logdirs,
+        plot_dir=plot_dir_path,
+        backup_dir=backup_dir,
+        do_backup=do_backup,
+        plot_file_suffix=plot_file_suffix,
+    )
+    gpu_data_table = gpu_data_report(
+        log_dirs=gpu_data_logdirs,
+        plot_dir=plot_dir_path,
+        backup_dir=backup_dir,
+        do_backup=do_backup,
+        plot_file_suffix=plot_file_suffix,
+    )
+    communication_data_table = communication_data_report(
+        log_dirs=comm_time_logdirs,
+        plot_dir=plot_dir_path,
+        backup_dir=backup_dir,
+        do_backup=do_backup,
+        plot_file_suffix=plot_file_suffix,
+    )
+
+    print()
+    print("#" * 8, "Epoch Time Report", "#" * 8)
+    print(epoch_time_table)
+    print()
+    print("#" * 8, "GPU Data Report", "#" * 8)
+    print(gpu_data_table)
+    print()
+    print("#" * 8, "Communication Data Report", "#" * 8)
+    print(communication_data_table)
+    print()
 
 
 @app.command()
diff --git a/src/itwinai/scalability_report/data.py b/src/itwinai/scalability_report/data.py
@@ -54,9 +54,7 @@ def read_scalability_metrics_from_csv(
     dataframes = []
     for file_path in file_paths:
         df = pd.read_csv(file_path)
-        check_contains_columns(
-            df=df, expected_columns=expected_columns, file_path=file_path
-        )
+        check_contains_columns(df=df, expected_columns=expected_columns, file_path=file_path)
         dataframes.append(df)
 
     return pd.concat(dataframes)
diff --git a/src/itwinai/scalability_report/reports.py b/src/itwinai/scalability_report/reports.py
@@ -9,6 +9,8 @@
 # --------------------------------------------------------------------------------------
 
 from pathlib import Path
+from typing import List
+import pandas as pd
 
 from itwinai.scalability_report.data import read_scalability_metrics_from_csv
 from itwinai.scalability_report.plot import (
@@ -24,37 +26,41 @@
 
 
 def epoch_time_report(
-    epoch_time_dir: Path | str,
+    log_dirs: List[Path] | List[str], 
     plot_dir: Path | str,
     backup_dir: Path,
     do_backup: bool = False,
     plot_file_suffix: str = ".png",
-) -> None:
+) -> str:
     """Generates reports and plots for epoch training times across distributed training
     strategies, including a log-log plot of absolute average epoch times against the
     number of GPUs and a log-log plot of relative speedup as more GPUs are added. The
     function optionally creates backups of the data.
 
     Args:
-        epoch_time_dir (Path | str): Path to the directory containing CSV files with
-            epoch time metrics. The files must include the columns "name", "nodes",
-            "epoch_id", and "time".
+        log_dirs (List[Path] | List[str]): Paths to the directories containing CSV
+            files with epoch time metrics; data from all of them is concatenated. The
+            files must include the columns "name", "nodes", "epoch_id", and "time".
         plot_dir (Path | str): Path to the directory where the generated plots will
             be saved.
         backup_dir (Path): Path to the directory where backups of the data will be stored
             if `do_backup` is True.
         do_backup (bool): Whether to create a backup of the epoch time data in the
             `backup_dir`. Defaults to False.
     """
-    if isinstance(epoch_time_dir, str):
-        epoch_time_dir = Path(epoch_time_dir)
     if isinstance(plot_dir, str):
         plot_dir = Path(plot_dir)
 
     epoch_time_expected_columns = {"name", "nodes", "epoch_id", "time"}
-    epoch_time_df = read_scalability_metrics_from_csv(
-        data_dir=epoch_time_dir, expected_columns=epoch_time_expected_columns
-    )
+
+    # Reading data from all the logdirs and concatenating the results
+    dataframes = []
+    for log_dir in log_dirs:
+        temp_df = read_scalability_metrics_from_csv(
+            data_dir=log_dir, expected_columns=epoch_time_expected_columns
+        )
+        dataframes.append(temp_df)
+    epoch_time_df = pd.concat(dataframes)
 
     # Calculate the average time per epoch for each strategy and number of nodes
     avg_epoch_time_df = (
@@ -66,7 +72,7 @@ def epoch_time_report(
     # Print the resulting table
     formatters = {"avg_epoch_time": "{:.2f} s".format}
     epoch_time_table = avg_epoch_time_df.to_string(index=False, formatters=formatters)
-    print(epoch_time_table)
+    # The table is not printed here; it is returned so that the caller can print it
 
     # Create and save the figures
     absolute_fig, _ = absolute_avg_epoch_time_plot(avg_epoch_time_df=avg_epoch_time_df)
@@ -81,21 +87,22 @@ def epoch_time_report(
     print(f"Saved relative average time plot at '{relative_speedup_plot_path.resolve()}'.")
 
     if not do_backup:
-        return
+        return epoch_time_table
 
     backup_dir.mkdir(exist_ok=True, parents=True)
     backup_path = backup_dir / "epoch_time_data.csv"
     epoch_time_df.to_csv(backup_path)
     print(f"Storing backup file at '{backup_path.resolve()}'.")
+    return epoch_time_table
 
 
 def gpu_data_report(
-    gpu_data_dir: Path | str,
+    log_dirs: List[Path] | List[str],
     plot_dir: Path | str,
     backup_dir: Path,
     do_backup: bool = False,
     plot_file_suffix: str = ".png",
-) -> None:
+) -> str:
     """Generates reports and plots for GPU energy consumption and utilization across
     distributed training strategies. Includes bar plots for energy consumption and GPU
     utilization by strategy and number of GPUs. The function optionally creates backups
@@ -115,6 +122,7 @@ def gpu_data_report(
     """
     if isinstance(plot_dir, str):
         plot_dir = Path(plot_dir)
+
     gpu_data_expected_columns = {
         "sample_idx",
         "utilization",
@@ -125,9 +133,14 @@ def gpu_data_report(
         "strategy",
         "probing_interval",
     }
-    gpu_data_df = read_scalability_metrics_from_csv(
-        data_dir=gpu_data_dir, expected_columns=gpu_data_expected_columns
-    )
+    dataframes = []
+    for log_dir in log_dirs:
+        temp_df = read_scalability_metrics_from_csv(
+            data_dir=log_dir, expected_columns=gpu_data_expected_columns
+        )
+        dataframes.append(temp_df)
+    gpu_data_df = pd.concat(dataframes)
+
     gpu_data_statistics_df = calculate_gpu_statistics(
         gpu_data_df=gpu_data_df, expected_columns=gpu_data_expected_columns
     )
@@ -136,7 +149,6 @@ def gpu_data_report(
         "utilization": "{:.2f} %".format,
     }
     gpu_data_table = gpu_data_statistics_df.to_string(index=False, formatters=formatters)
-    print(gpu_data_table)
 
     energy_plot_path = plot_dir / ("gpu_energy_plot" + plot_file_suffix)
     utilization_plot_path = plot_dir / ("utilization_plot" + plot_file_suffix)
@@ -158,21 +170,22 @@ def gpu_data_report(
     print(f"Saved utilization plot at '{utilization_plot_path.resolve()}'.")
 
     if not do_backup:
-        return
+        return gpu_data_table
 
     backup_dir.mkdir(exist_ok=True, parents=True)
     backup_path = backup_dir / "gpu_data.csv"
     gpu_data_df.to_csv(backup_path)
     print(f"Storing backup file at '{backup_path.resolve()}'.")
+    return gpu_data_table
 
 
 def communication_data_report(
-    communication_data_dir: Path | str,
+    log_dirs: List[Path] | List[str],
     plot_dir: Path | str,
     backup_dir: Path,
     do_backup: bool = False,
     plot_file_suffix: str = ".png",
-) -> None:
+) -> str:
     """Generates reports and plots for communication and computation fractions across
     distributed training strategies. Includes a bar plot showing the fraction of time
     spent on computation vs communication for each strategy and GPU count. The function
@@ -199,17 +212,19 @@ def communication_data_report(
         "name",
         "self_cuda_time_total",
     }
-    communication_data_df = read_scalability_metrics_from_csv(
-        data_dir=communication_data_dir,
-        expected_columns=communication_data_expected_columns,
-    )
+    dataframes = []
+    for log_dir in log_dirs:
+        temp_df = read_scalability_metrics_from_csv(
+            data_dir=log_dir, expected_columns=communication_data_expected_columns
+        )
+        dataframes.append(temp_df)
+    communication_data_df = pd.concat(dataframes)
     computation_fraction_df = get_computation_fraction_data(communication_data_df)
 
     formatters = {"computation_fraction": lambda x: "{:.2f} %".format(x * 100)}
     communication_data_table = computation_fraction_df.to_string(
         index=False, formatters=formatters
     )
-    print(communication_data_table)
 
     computation_fraction_plot_path = plot_dir / (
         "computation_fraction_plot" + plot_file_suffix
@@ -219,9 +234,10 @@ def communication_data_report(
     print(f"Saved computation fraction plot at '{computation_fraction_plot_path.resolve()}'.")
 
     if not do_backup:
-        return
+        return communication_data_table
 
     backup_dir.mkdir(exist_ok=True, parents=True)
     backup_path = backup_dir / "communication_data.csv"
     communication_data_df.to_csv(backup_path)
     print(f"Storing backup file at '{backup_path.resolve()}'.")
+    return communication_data_table
diff --git a/src/itwinai/torch/monitoring/monitoring.py b/src/itwinai/torch/monitoring/monitoring.py
@@ -175,7 +175,7 @@ def measured_method(self: 'TorchTrainer', *args, **kwargs) -> Any:
 
         global_utilization_log = strategy.gather_obj(local_utilization_log, dst_rank=0)
         if strategy.is_main_worker:
-            output_dir = Path("scalability-metrics/gpu-energy-data")
+            output_dir = Path(f"scalability-metrics/{self.run_id}/gpu-energy-data")
             output_dir.mkdir(exist_ok=True, parents=True)
             output_path = output_dir / f"{strategy_name}_{num_global_gpus}.csv"
 
diff --git a/src/itwinai/torch/profiling/profiler.py b/src/itwinai/torch/profiling/profiler.py
@@ -124,7 +124,7 @@ def profiled_method(self: 'TorchTrainer', *args, **kwargs) -> Any:
         profiling_dataframe["num_gpus"] = num_gpus_global
         profiling_dataframe["global_rank"] = global_rank
 
-        profiling_log_dir = Path("scalability-metrics/communication-data")
+        profiling_log_dir = Path(f"scalability-metrics/{self.run_id}/communication-data")
         profiling_log_dir.mkdir(parents=True, exist_ok=True)
 
         filename = f"{strategy_name}_{num_gpus_global}_{global_rank}.csv"
diff --git a/src/itwinai/torch/trainer.py b/src/itwinai/torch/trainer.py
@@ -123,7 +123,13 @@ class TorchTrainer(Trainer, LogMixin):
     #: PyTorch Profiler for communication vs. computation comparison
     profiler: Any | None
 
+    #: Toggles for the profilers
     measure_gpu_data: bool = False
+    measure_communication_overhead: bool = False
+    measure_epoch_time: bool = False
+
+    #: Run ID
+    run_id: str 
 
     def __init__(
         self,
@@ -144,7 +150,8 @@ def __init__(
         profiling_warmup_epochs: int = 2,
         measure_gpu_data: bool = False,
         measure_communication_overhead: bool = False,
-        measure_epoch_time: bool = False
+        measure_epoch_time: bool = False,
+        run_id: str | None = None
     ) -> None:
         super().__init__(name)
         self.save_parameters(**self.locals2params(locals()))
@@ -174,6 +181,9 @@ def __init__(
         self.measure_communication_overhead = measure_communication_overhead
         self.measure_epoch_time = measure_epoch_time
 
+        if run_id is None:
+            run_id = "run0"
+        self.run_id = run_id
 
     @property
     def strategy(self) -> TorchDistributedStrategy:
@@ -565,7 +575,7 @@ def train(self):
                     " when running distributed training!"
                 )
             num_nodes = int(os.environ["SLURM_NNODES"])
-            epoch_time_output_dir = Path("scalability-metrics/epoch-time")
+            epoch_time_output_dir = Path(f"scalability-metrics/{self.run_id}/epoch-time")
             epoch_time_file_name = f"epochtime_{self.strategy.name}_{num_nodes}N.csv"
             epoch_time_output_path = epoch_time_output_dir / epoch_time_file_name