# --------------------------------------------------------------------------------------
from pathlib import Path
+ from typing import List
+ import pandas as pd

from itwinai.scalability_report.data import read_scalability_metrics_from_csv
from itwinai.scalability_report.plot import (

def epoch_time_report(
-     epoch_time_dir: Path | str,
+     log_dirs: List[Path] | List[str],
    plot_dir: Path | str,
    backup_dir: Path,
    do_backup: bool = False,
    plot_file_suffix: str = ".png",
- ) -> None:
+ ) -> str:
    """Generates reports and plots for epoch training times across distributed training
    strategies, including a log-log plot of absolute average epoch times against the
    number of GPUs and a log-log plot of relative speedup as more GPUs are added. The
    function optionally creates backups of the data.

    Args:
-         epoch_time_dir (Path | str): Path to the directory containing CSV files with
-             epoch time metrics. The files must include the columns "name", "nodes",
-             "epoch_id", and "time".
+         log_dirs (List[Path] | List[str]): Paths to the directories containing CSV files
+             with epoch time metrics. The files must include the columns "name", "nodes",
+             "epoch_id", and "time".
        plot_dir (Path | str): Path to the directory where the generated plots will
            be saved.
        backup_dir (Path): Path to the directory where backups of the data will be stored
            if `do_backup` is True.
        do_backup (bool): Whether to create a backup of the epoch time data in the
            `backup_dir`. Defaults to False.
    """
-     if isinstance(epoch_time_dir, str):
-         epoch_time_dir = Path(epoch_time_dir)
    if isinstance(plot_dir, str):
        plot_dir = Path(plot_dir)

    epoch_time_expected_columns = {"name", "nodes", "epoch_id", "time"}
+
+     # Reading data from all the logdirs and concatenating the results
+     dataframes = []
+     for log_dir in log_dirs:
+         temp_df = read_scalability_metrics_from_csv(
+             data_dir=log_dir, expected_columns=epoch_time_expected_columns
+         )
+         dataframes.append(temp_df)
+     epoch_time_df = pd.concat(dataframes)

    # Calculate the average time per epoch for each strategy and number of nodes
    avg_epoch_time_df = (
@@ -66,7 +72,7 @@ def epoch_time_report(
    # Print the resulting table
    formatters = {"avg_epoch_time": "{:.2f} s".format}
    epoch_time_table = avg_epoch_time_df.to_string(index=False, formatters=formatters)
-     print(epoch_time_table)
+     # print(epoch_time_table)

    # Create and save the figures
    absolute_fig, _ = absolute_avg_epoch_time_plot(avg_epoch_time_df=avg_epoch_time_df)
@@ -81,21 +87,22 @@ def epoch_time_report(
    print(f"Saved relative average time plot at '{relative_speedup_plot_path.resolve()}'.")

    if not do_backup:
-         return
+         return epoch_time_table

    backup_dir.mkdir(exist_ok=True, parents=True)
    backup_path = backup_dir / "epoch_time_data.csv"
    epoch_time_df.to_csv(backup_path)
    print(f"Storing backup file at '{backup_path.resolve()}'.")
+     return epoch_time_table
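
# Usage sketch (hypothetical paths, not part of the diff): with the new signature a
# caller passes a list of log directories and receives the formatted table as a string.
from pathlib import Path

parent = Path("scalability-logs")  # hypothetical layout: one subdirectory per run
log_dirs = sorted(p for p in parent.iterdir() if p.is_dir())

epoch_table = epoch_time_report(
    log_dirs=log_dirs,
    plot_dir=Path("plots"),
    backup_dir=Path("backups"),
)
print(epoch_table)  # the table is returned rather than printed inside the function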


def gpu_data_report(
-     gpu_data_dir: Path | str,
+     log_dirs: List[Path] | List[str],
    plot_dir: Path | str,
    backup_dir: Path,
    do_backup: bool = False,
    plot_file_suffix: str = ".png",
- ) -> None:
+ ) -> str:
    """Generates reports and plots for GPU energy consumption and utilization across
    distributed training strategies. Includes bar plots for energy consumption and GPU
    utilization by strategy and number of GPUs. The function optionally creates backups
@@ -115,6 +122,7 @@ def gpu_data_report(
    """
    if isinstance(plot_dir, str):
        plot_dir = Path(plot_dir)
+
    gpu_data_expected_columns = {
        "sample_idx",
        "utilization",
@@ -125,9 +133,14 @@ def gpu_data_report(
        "strategy",
        "probing_interval",
    }
-     gpu_data_df = read_scalability_metrics_from_csv(
-         data_dir=gpu_data_dir, expected_columns=gpu_data_expected_columns
-     )
+     dataframes = []
+     for log_dir in log_dirs:
+         temp_df = read_scalability_metrics_from_csv(
+             data_dir=log_dir, expected_columns=gpu_data_expected_columns
+         )
+         dataframes.append(temp_df)
+     gpu_data_df = pd.concat(dataframes)
+
    gpu_data_statistics_df = calculate_gpu_statistics(
        gpu_data_df=gpu_data_df, expected_columns=gpu_data_expected_columns
    )
@@ -136,7 +149,6 @@ def gpu_data_report(
        "utilization": "{:.2f} %".format,
    }
    gpu_data_table = gpu_data_statistics_df.to_string(index=False, formatters=formatters)
-     print(gpu_data_table)

    energy_plot_path = plot_dir / ("gpu_energy_plot" + plot_file_suffix)
    utilization_plot_path = plot_dir / ("utilization_plot" + plot_file_suffix)
@@ -158,21 +170,22 @@ def gpu_data_report(
    print(f"Saved utilization plot at '{utilization_plot_path.resolve()}'.")

    if not do_backup:
-         return
+         return gpu_data_table

    backup_dir.mkdir(exist_ok=True, parents=True)
    backup_path = backup_dir / "gpu_data.csv"
    gpu_data_df.to_csv(backup_path)
    print(f"Storing backup file at '{backup_path.resolve()}'.")
+     return gpu_data_table
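
# Note on the read-and-concatenate pattern above: pandas.concat keeps each per-directory
# frame's original row index, so the combined frame can contain duplicate index values.
# If a unique index matters downstream, ignore_index=True is one option. A minimal sketch
# with hypothetical frames (not a change to the diff):
import pandas as pd

df_a = pd.DataFrame({"utilization": [55.0, 60.0]})
df_b = pd.DataFrame({"utilization": [70.0, 65.0]})
combined = pd.concat([df_a, df_b], ignore_index=True)  # index becomes 0..3 instead of 0, 1, 0, 1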


def communication_data_report(
-     communication_data_dir: Path | str,
+     log_dirs: List[Path] | List[str],
    plot_dir: Path | str,
    backup_dir: Path,
    do_backup: bool = False,
    plot_file_suffix: str = ".png",
- ) -> None:
+ ) -> str:
    """Generates reports and plots for communication and computation fractions across
    distributed training strategies. Includes a bar plot showing the fraction of time
    spent on computation vs communication for each strategy and GPU count. The function
@@ -199,17 +212,19 @@ def communication_data_report(
        "name",
        "self_cuda_time_total",
    }
-     communication_data_df = read_scalability_metrics_from_csv(
-         data_dir=communication_data_dir,
-         expected_columns=communication_data_expected_columns,
-     )
+     dataframes = []
+     for log_dir in log_dirs:
+         temp_df = read_scalability_metrics_from_csv(
+             data_dir=log_dir, expected_columns=communication_data_expected_columns
+         )
+         dataframes.append(temp_df)
+     communication_data_df = pd.concat(dataframes)
    computation_fraction_df = get_computation_fraction_data(communication_data_df)

    formatters = {"computation_fraction": lambda x: "{:.2f} %".format(x * 100)}
    communication_data_table = computation_fraction_df.to_string(
        index=False, formatters=formatters
    )
-     print(communication_data_table)

    computation_fraction_plot_path = plot_dir / (
        "computation_fraction_plot" + plot_file_suffix
@@ -219,9 +234,10 @@ def communication_data_report(
    print(f"Saved computation fraction plot at '{computation_fraction_plot_path.resolve()}'.")

    if not do_backup:
-         return
+         return communication_data_table

    backup_dir.mkdir(exist_ok=True, parents=True)
    backup_path = backup_dir / "communication_data.csv"
    communication_data_df.to_csv(backup_path)
    print(f"Storing backup file at '{backup_path.resolve()}'.")
+     return communication_data_table
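
# Usage sketch (hypothetical paths and output file, not part of the diff): since all
# three report functions now return their formatted tables, a caller can assemble them
# into one text report.
from pathlib import Path

log_dirs = [Path("scalability-logs/run-1"), Path("scalability-logs/run-2")]
plot_dir = Path("plots")
backup_dir = Path("backups")

sections = {
    "Epoch time": epoch_time_report(log_dirs=log_dirs, plot_dir=plot_dir, backup_dir=backup_dir),
    "GPU data": gpu_data_report(log_dirs=log_dirs, plot_dir=plot_dir, backup_dir=backup_dir),
    "Communication": communication_data_report(log_dirs=log_dirs, plot_dir=plot_dir, backup_dir=backup_dir),
}
report_text = "\n\n".join(f"{title}\n{table}" for title, table in sections.items())
Path("scalability_report.txt").write_text(report_text)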