Commit 50ed3ce

[WIP] style(megabench): improve code formatting and import ordering (#497)
* 💄 style(megabench): improve code formatting and import ordering
  - re-order imports based on PEP 8
  - remove unnecessary blank lines
  - format tuples and lists to be more concise
  - apply consistent string formatting across files

  ♻️ refactor(megabench): optimize loop and conditional structures
  - refactor match-case statements for Python 3.9 compatibility
  - optimize list comprehensions for improved performance
  - simplify nested conditionals for better readability

* 💄 style(metrics): reformat import statements for readability
  - convert single-line import statements to multi-line format using parentheses for better readability
  - remove unnecessary blank lines to improve code consistency

* ✨ feat(utils): add timestamp to submission file names
  - import time module for timestamp generation
  - append timestamp to submission file names for uniqueness and traceability
1 parent 4bb2f27 commit 50ed3ce

44 files changed (+346, −565 lines)
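
The feat(utils) item in the commit message (appending a timestamp to submission file names) touches a file whose diff is not among the hunks shown below. A minimal sketch of that kind of change, with an assumed base file name, could look like:

```python
import time

# Hypothetical illustration only: the actual base name and save path used in utils are not shown in this commit view.
timestamp = int(time.time())
submission_file = f"megabench_submission_{timestamp}.json"
print(submission_file)  # e.g. megabench_submission_1718000000.json
```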

lmms_eval/tasks/megabench/breakdown/analysis_utils.py

Lines changed: 6 additions & 10 deletions
@@ -1,10 +1,11 @@
 import json
-from collections import defaultdict
 import os
+from collections import defaultdict
 
 # Add path definition at the top after imports
 all_task_meta_path = os.path.join(os.path.dirname(__file__), "all_task_meta.json")
 
+
 def task_list_refine(task_list):
     task_results = []
     for task in task_list:
@@ -47,12 +48,7 @@ def derive_keyword_stats(task_results_with_meta, include_per_task_info=False):
             if include_per_task_info:
                 skills_stats[skill]["tasks"].append((task_name, score))
 
-        for stat_dict, key in [
-            (input_format_stats, "input_format"),
-            (output_format_stats, "output_format"),
-            (input_num_stats, "num_input"),
-            (app_stats, "app")
-        ]:
+        for stat_dict, key in [(input_format_stats, "input_format"), (output_format_stats, "output_format"), (input_num_stats, "num_input"), (app_stats, "app")]:
             if value := task.get(key):
                 stat_dict[value]["count"] += 1
                 stat_dict[value]["total_score"] += score
@@ -83,16 +79,16 @@ def collect_task_metadata(model_results):
     # Load the complete task metadata
     with open(all_task_meta_path, "r") as f:
         all_meta = json.load(f)
-
+
     # Create result dictionary
     all_task_meta = {}
-
+
     # Match results with metadata
     for task_result in model_results:
         task_name = task_result["name"]
         if task_name in all_meta:
             meta = all_meta[task_name].copy()  # Create a copy to avoid modifying original
             meta.update(task_result)
             all_task_meta[task_name] = meta
-
+
     return all_task_meta
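
The loop above merges each result into a copy of its metadata so the loaded metadata dict is never mutated. A small standalone sketch of that copy-then-update idiom (the sample dictionaries are made up):

```python
# Made-up inputs mirroring the shapes collect_task_metadata works with.
all_meta = {"task_a": {"skills": ["OCR"], "app": "coding"}}
model_results = [{"name": "task_a", "score": 0.75, "eval_type": "rule"}]

all_task_meta = {}
for task_result in model_results:
    task_name = task_result["name"]
    if task_name in all_meta:
        meta = all_meta[task_name].copy()  # copy so all_meta stays unchanged
        meta.update(task_result)           # result fields win on any key clash
        all_task_meta[task_name] = meta

print(all_task_meta)
# {'task_a': {'skills': ['OCR'], 'app': 'coding', 'name': 'task_a', 'score': 0.75, 'eval_type': 'rule'}}
```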
Lines changed: 36 additions & 47 deletions
@@ -1,20 +1,18 @@
-import json
 import argparse
+import json
 from pathlib import Path
-from analysis_utils import (
-    task_list_refine,
-    collect_task_metadata,
-    derive_keyword_stats,
-)
+
+from analysis_utils import collect_task_metadata, derive_keyword_stats, task_list_refine
+
 
 def calculate_model_summary(task_results_with_meta):
     """
     Re-calculate model performance summary statistics across core and open tasks.
-
+
     Args:
         task_results: List of task results with scores
         task_metadata: Dictionary containing task metadata including task types
-
+
     Returns:
         Dictionary containing summary statistics for core and open tasks
     """
@@ -23,27 +21,27 @@ def calculate_model_summary(task_results_with_meta):
 
     # Separate core and open tasks
     for task in task_results_with_meta.values():
-        if task['eval_type'] == 'llm':
+        if task["eval_type"] == "llm":
             open_tasks.append(task)
         else:
             core_tasks.append(task)
-
+
     def calculate_stats(tasks):
         if not tasks:
             return None
-
-        total_samples = sum(task.get('num_query', 0) for task in tasks)
-        macro_scores = [task.get('score', 0) for task in tasks]
-
+
+        total_samples = sum(task.get("num_query", 0) for task in tasks)
+        macro_scores = [task.get("score", 0) for task in tasks]
+
         return {
             "num_eval_tasks": len(tasks),
             "num_eval_samples": total_samples,
             "macro_mean_score": sum(macro_scores) / len(tasks) if tasks else 0,
         }
-
+
     core_stats = calculate_stats(core_tasks)
     open_stats = calculate_stats(open_tasks)
-
+
     # Calculate overall score (weighted average based on number of tasks)
     # If either stat is None, use only the available stat
     if core_stats is None:
@@ -53,17 +51,11 @@ def calculate_stats(tasks):
         overall_score = core_stats["macro_mean_score"] if core_stats else 0
         total_tasks = core_stats["num_eval_tasks"] if core_stats else 0
     else:
-        total_tasks = (core_stats["num_eval_tasks"] + open_stats["num_eval_tasks"])
-        overall_score = (
-            (core_stats["macro_mean_score"] * core_stats["num_eval_tasks"] +
-             open_stats["macro_mean_score"] * open_stats["num_eval_tasks"]) / total_tasks
-        )
-
-    return {
-        "core": core_stats,
-        "open": open_stats,
-        "overall_score": overall_score
-    }
+        total_tasks = core_stats["num_eval_tasks"] + open_stats["num_eval_tasks"]
+        overall_score = (core_stats["macro_mean_score"] * core_stats["num_eval_tasks"] + open_stats["macro_mean_score"] * open_stats["num_eval_tasks"]) / total_tasks
+
+    return {"core": core_stats, "open": open_stats, "overall_score": overall_score}
+
 
 def merge_json_files(input_dir, output_path, key="name"):
     """
@@ -72,76 +64,73 @@ def merge_json_files(input_dir, output_path, key="name"):
     Prioritizes LLM evaluations over rule-based ones when duplicates exist.
     """
     data_dict = {}  # Using name as key for easy lookup and updates
-
+
     # Find all matching JSON files in the directory
     json_paths = list(Path(input_dir).glob("megabench*data_with_scores*.json"))
     print(f"Found {len(json_paths)} files to merge")
-
+
     # Load and merge all JSON files
     for path in json_paths:
         print(f"Processing {path}")
         with open(path, "r") as f:
             data = json.load(f)
             if isinstance(data, dict) and "data" in data:
                 data = task_list_refine(data["data"])
-
+
         # Update or add entries
         for item in data:
             item_key = item[key]
             # If new item or if new item is LLM-evaluated (prioritize LLM eval)
-            if item_key not in data_dict or (
-                item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"
-            ):
+            if item_key not in data_dict or (item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"):
                 data_dict[item_key] = item
 
     # Convert back to list
     merged_data = list(data_dict.values())
-
+
     # Save the merged result
     output_path.parent.mkdir(parents=True, exist_ok=True)
     with open(output_path, "w") as f:
         json.dump(merged_data, f, indent=4)
-
+
     print(f"Merged file with {len(merged_data)} tasks saved to {output_path}")
     return merged_data
 
+
 def main():
     # Parse command line arguments
-    parser = argparse.ArgumentParser(description='Merge and process evaluation score files.')
-    parser.add_argument('--input_dir', type=str, help='Directory containing score files')
+    parser = argparse.ArgumentParser(description="Merge and process evaluation score files.")
+    parser.add_argument("--input_dir", type=str, help="Directory containing score files")
     args = parser.parse_args()
 
     # Convert path to Path object
     input_dir = Path(args.input_dir)
-
+
     # Create analysis directory under input directory
     output_dir = input_dir / "analysis"
     output_dir.mkdir(parents=True, exist_ok=True)
-
+
     # Merge files
     output_path = output_dir / "task_results.json"
     task_results = merge_json_files(input_dir, output_path)
-
+
    # Collect metadata and derive keyword stats
     task_results_with_meta = collect_task_metadata(task_results)
     keyword_stats = derive_keyword_stats(task_results_with_meta)
-
+
     # Calculate model summary
     model_summary = calculate_model_summary(task_results_with_meta)
 
-    summary_results = {
-        "model_summary": model_summary,
-        "keyword_stats": keyword_stats
-    }
-
+    summary_results = {"model_summary": model_summary, "keyword_stats": keyword_stats}
+
     # Save keyword stats
     stats_output = output_dir / "summary_and_keyword_stats.json"
     with open(stats_output, "w") as f:
         json.dump(summary_results, f, indent=4)
-
+
     print(f"\nResults saved in {output_dir}:")
     print(f"- Merged data: {output_path}")
     print(f"- Multi-dimensional keywords stats: {stats_output}")
 
+
 if __name__ == "__main__":
     main()
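
The single-line condition kept in the merge_json_files hunk above preserves its duplicate rule: an existing entry is replaced only when the incoming item is LLM-evaluated and the stored one is not. A minimal sketch with hypothetical records:

```python
# Hypothetical score records; only the replacement rule mirrors the diff above.
records = [
    {"name": "task_a", "score": 0.4, "eval_type": "rule"},
    {"name": "task_a", "score": 0.5, "eval_type": "llm"},  # replaces the rule-based entry
    {"name": "task_b", "score": 0.7, "eval_type": "rule"},
]

data_dict = {}
for item in records:
    item_key = item["name"]
    if item_key not in data_dict or (item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"):
        data_dict[item_key] = item

print(list(data_dict.values()))
# [{'name': 'task_a', 'score': 0.5, 'eval_type': 'llm'}, {'name': 'task_b', 'score': 0.7, 'eval_type': 'rule'}]
```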
