run_loader_eval.py
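"""Standalone metric evaluation over an existing ReForm-Eval prediction file.

Loads a prediction JSON file and recomputes the metric for the requested
problem formulation (e.g. SingleChoice or Generation); for choice-style
formulations it also reports the format hit rate, and it can optionally
measure prediction stability across prompts and multi-round performance.
No model inference is run here.
"""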
import os, json
from utils.logger import setup_logger
from metrics import get_metric
import tqdm
from utils.run_utils import *
from argparse import ArgumentParser, Namespace


def loader_eval(formulation, multi_round_eval, eval_stability, infer_method='generation', prediction_file=None):
    args = Namespace()
    if 'WORLD_SIZE' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])
    else:
        args.local_rank = -1
    args.formulation = formulation
    args.multi_round_eval = multi_round_eval
    args.eval_stability = eval_stability
    # the output dir (used for logging) defaults to the directory of the prediction file
    args.prediction_file = prediction_file
    args.output_dir = os.path.dirname(args.prediction_file)
    args.infer_method = infer_method
    # args.full_path = json_file # os.path.join(args.output_dir, args.json_file)
    global logger
    logger = setup_logger('ReForm-Eval Evaluation', args.output_dir, args.local_rank)
    # logger.info('Evaluating with {} GPUs'.format(args.n_gpus))
    if os.path.exists(args.prediction_file):
        logger.info('found the existing prediction in {}'.format(args.prediction_file))
        with open(args.prediction_file, 'r') as f:
            full_res = json.load(f)
        # ori_args = torch.load(get_output_name(args, mid_output=False)[:-4]+'args.bin')
        # logger.info('And the original arguments are: %s', ori_args)
        metric_eval(args, full_res=full_res)
    return
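
# A minimal sketch of calling loader_eval() directly from Python; the
# prediction file path below is illustrative only and must point to a JSON
# file produced by a previous ReForm-Eval inference run:
#
#   loader_eval('SingleChoice', multi_round_eval=False, eval_stability=True,
#               infer_method='generation',
#               prediction_file='output/SingleChoice_predictions.json')
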
def metric_eval(args, full_res):
    from collections import defaultdict
    import numpy as np
    # load the evaluation metric for the given formulation
    logger.info('evaluating the predictions with the {} metric'.format(args.formulation))
    if args.formulation == 'SingleChoice':
        metric_param = {'infer_method': args.infer_method}
    else:
        metric_param = None
    metric = get_metric(args.formulation, metric_param)
    sum_of_metric = 0
    # for the accuracy metric
    question2metric = defaultdict(list)
    # for stability measurement
    question2pred = defaultdict(list)
    # for multi-round measurement
    if args.multi_round_eval:
        round2metric = defaultdict(list)
    if args.formulation == 'Generation':
        generation_metrics = metric(full_res)
        for method, value in generation_metrics.items():
            logger.info('the evaluated {} {} result: {}'.format(args.formulation, method, value))
    else:
        ### for format hit rate
        hit_num = 0
        for item in tqdm.tqdm(full_res, desc='judging with the selected metric'):
            m, pred = metric(item['prediction'], item['answer'])
            sum_of_metric += m
            if args.multi_round_eval:
                round2metric[item['round_id']].append(m)
            question2metric[item['sample_id']].append(m)
            # map the predicted index back to the option text
            if pred is not None:
                hit_num += 1
                try:
                    question2pred[item['sample_id']].append(item['answer_options'][pred])
                except IndexError:
                    print('found out of range prediction: {}'.format(pred))
                    question2pred[item['sample_id']].append(item['prediction'])
            else:
                question2pred[item['sample_id']].append(item['prediction'])
        metric_matrix = np.array(list(question2metric.values()))
        mean_metric = np.mean(metric_matrix)
        logger.info('the evaluated {} result: {}'.format(args.formulation, mean_metric))
        logger.info('the format hit rate is {}'.format(hit_num / len(full_res)))
    if args.eval_stability:
        # stability measurement; entropy_calculation is provided by the star import from utils.run_utils
        assert args.formulation == 'SingleChoice', 'only single-choice problems support instability evaluation!'
        mean_entropy = entropy_calculation(question2pred)
        logger.info('the measured stability (entropy on predictions) across prompts: {}'.format(mean_entropy))
    if args.multi_round_eval:
        # multi_round_eval here is the helper provided by the star import from utils.run_utils
        multi_round_res = multi_round_eval(round2metric)
        logger.info('corr(round, performance): {}, slope of linear_model(round, performance): {}'.format(multi_round_res[0], multi_round_res[1]))
def main():
    parser = ArgumentParser()
    parser.add_argument('--formulation', type=str, default=None, help='the problem formulation to evaluate')
    parser.add_argument('--infer_method', type=str, default='generation', help='the inference method to use, likelihood or generation')
    parser.add_argument('--eval_stability', action='store_true', help='whether to evaluate the stability')
    parser.add_argument('--multi_round_eval', action='store_true', help='whether to evaluate multi-round performance')
    # output setup
    parser.add_argument('--prediction_file', type=str, default=None, required=True, help='the prediction json file')
    # parser.add_argument('--output_dir', type=str, default=None, help='the path to save the log, default to be in the directory of the prediction file')
    args = parser.parse_args()
    # the output dir is set to the directory of the prediction file inside loader_eval
    # if args.output_dir is None:
    #     args.output_dir = os.path.dirname(args.prediction_file)
    loader_eval(args.formulation, args.multi_round_eval, args.eval_stability, args.infer_method, args.prediction_file)


if __name__ == '__main__':
    main()
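
# Example invocation (a sketch; the prediction file path is hypothetical and
# should point to a JSON prediction file produced by a prior inference run):
#
#   python run_loader_eval.py \
#       --formulation SingleChoice \
#       --infer_method generation \
#       --eval_stability \
#       --prediction_file output/SingleChoice_generation_predictions.json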