@@ -1,20 +1,18 @@
-import json
 import argparse
+import json
 from pathlib import Path
-from analysis_utils import (
-    task_list_refine,
-    collect_task_metadata,
-    derive_keyword_stats,
-)
+
+from analysis_utils import collect_task_metadata, derive_keyword_stats, task_list_refine
+
 
 
 def calculate_model_summary(task_results_with_meta):
     """
     Re-calculate model performance summary statistics across core and open tasks.
-    
+
     Args:
         task_results: List of task results with scores
        task_metadata: Dictionary containing task metadata including task types
-    
+
     Returns:
         Dictionary containing summary statistics for core and open tasks
     """
@@ -23,27 +21,27 @@ def calculate_model_summary(task_results_with_meta):
 
     # Separate core and open tasks
     for task in task_results_with_meta.values():
-        if task['eval_type'] == 'llm':
+        if task["eval_type"] == "llm":
             open_tasks.append(task)
         else:
             core_tasks.append(task)
-    
+
     def calculate_stats(tasks):
         if not tasks:
             return None
-        
-        total_samples = sum(task.get('num_query', 0) for task in tasks)
-        macro_scores = [task.get('score', 0) for task in tasks]
-        
+
+        total_samples = sum(task.get("num_query", 0) for task in tasks)
+        macro_scores = [task.get("score", 0) for task in tasks]
+
         return {
             "num_eval_tasks": len(tasks),
             "num_eval_samples": total_samples,
             "macro_mean_score": sum(macro_scores) / len(tasks) if tasks else 0,
         }
-    
+
     core_stats = calculate_stats(core_tasks)
     open_stats = calculate_stats(open_tasks)
-    
+
     # Calculate overall score (weighted average based on number of tasks)
     # If either stat is None, use only the available stat
     if core_stats is None:
@@ -53,17 +51,11 @@ def calculate_stats(tasks):
         overall_score = core_stats["macro_mean_score"] if core_stats else 0
         total_tasks = core_stats["num_eval_tasks"] if core_stats else 0
     else:
-        total_tasks = (core_stats["num_eval_tasks"] + open_stats["num_eval_tasks"])
-        overall_score = (
-            (core_stats["macro_mean_score"] * core_stats["num_eval_tasks"] +
-             open_stats["macro_mean_score"] * open_stats["num_eval_tasks"]) / total_tasks
-        )
-    
-    return {
-        "core": core_stats,
-        "open": open_stats,
-        "overall_score": overall_score
-    }
+        total_tasks = core_stats["num_eval_tasks"] + open_stats["num_eval_tasks"]
+        overall_score = (core_stats["macro_mean_score"] * core_stats["num_eval_tasks"] + open_stats["macro_mean_score"] * open_stats["num_eval_tasks"]) / total_tasks
+
+    return {"core": core_stats, "open": open_stats, "overall_score": overall_score}
+
 
 def merge_json_files(input_dir, output_path, key="name"):
     """
@@ -72,76 +64,73 @@ def merge_json_files(input_dir, output_path, key="name"):
     Prioritizes LLM evaluations over rule-based ones when duplicates exist.
     """
     data_dict = {}  # Using name as key for easy lookup and updates
-    
+
     # Find all matching JSON files in the directory
     json_paths = list(Path(input_dir).glob("megabench*data_with_scores*.json"))
     print(f"Found {len(json_paths)} files to merge")
-    
+
     # Load and merge all JSON files
     for path in json_paths:
         print(f"Processing {path}")
         with open(path, "r") as f:
             data = json.load(f)
         if isinstance(data, dict) and "data" in data:
             data = task_list_refine(data["data"])
-        
+
         # Update or add entries
         for item in data:
             item_key = item[key]
             # If new item or if new item is LLM-evaluated (prioritize LLM eval)
-            if item_key not in data_dict or (
-                item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"
-            ):
+            if item_key not in data_dict or (item.get("eval_type") == "llm" and data_dict[item_key].get("eval_type") != "llm"):
                 data_dict[item_key] = item
 
     # Convert back to list
     merged_data = list(data_dict.values())
-    
+
     # Save the merged result
     output_path.parent.mkdir(parents=True, exist_ok=True)
     with open(output_path, "w") as f:
         json.dump(merged_data, f, indent=4)
-    
+
     print(f"Merged file with {len(merged_data)} tasks saved to {output_path}")
     return merged_data
 
+
 def main():
     # Parse command line arguments
-    parser = argparse.ArgumentParser(description='Merge and process evaluation score files.')
-    parser.add_argument('--input_dir', type=str, help='Directory containing score files')
+    parser = argparse.ArgumentParser(description="Merge and process evaluation score files.")
+    parser.add_argument("--input_dir", type=str, help="Directory containing score files")
     args = parser.parse_args()
 
     # Convert path to Path object
     input_dir = Path(args.input_dir)
-    
+
     # Create analysis directory under input directory
     output_dir = input_dir / "analysis"
     output_dir.mkdir(parents=True, exist_ok=True)
-    
+
     # Merge files
     output_path = output_dir / "task_results.json"
     task_results = merge_json_files(input_dir, output_path)
-    
+
     # Collect metadata and derive keyword stats
     task_results_with_meta = collect_task_metadata(task_results)
     keyword_stats = derive_keyword_stats(task_results_with_meta)
-    
+
     # Calculate model summary
     model_summary = calculate_model_summary(task_results_with_meta)
 
-    summary_results = {
-        "model_summary": model_summary,
-        "keyword_stats": keyword_stats
-    }
-    
+    summary_results = {"model_summary": model_summary, "keyword_stats": keyword_stats}
+
     # Save keyword stats
     stats_output = output_dir / "summary_and_keyword_stats.json"
     with open(stats_output, "w") as f:
         json.dump(summary_results, f, indent=4)
-    
+
     print(f"\nResults saved in {output_dir}:")
     print(f"- Merged data: {output_path}")
     print(f"- Multi-dimensional keywords stats: {stats_output}")
 
+
 if __name__ == "__main__":
     main()