benchmarks.py
import json
import re
import time

import torch
from transformers import pipeline, TextGenerationPipeline


class BaseEvaluator:
    """Runs batched greedy-decoding inference over a dataset and scores the outputs."""

    def __init__(self, dataset, config):
        self.dataset = dataset
        self.max_new_tokens = config['max_new_tokens']
        self.batch_size = config['eval_batch_size']

    def infer(self, model, tokenizer):
        # Greedy decoding: sampling is disabled, so temperature/top_p are cleared explicitly.
        generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
        input_text = [i['prompt'] for i in self.dataset]
        responses = generator(
            input_text,
            max_new_tokens=self.max_new_tokens,
            do_sample=False,
            return_full_text=False,
            temperature=None,
            top_p=None,
            batch_size=self.batch_size,
        )
        output = [
            {
                "prompt": input_text[i],
                "raw_prediction": responses[i][0]['generated_text'],
                "raw_answers": self.dataset[i]['raw_answers'],
            }
            for i in range(len(responses))
        ]
        return output

    def eval_metric(self, results):
        scores = []
        for sample in results:
            raw_prediction, raw_answers = sample["raw_prediction"], sample["raw_answers"]
            prediction, answers = self.post_process(raw_prediction, raw_answers)
            score = self._metrics(prediction, answers[0])
            scores.append(score)
        return scores

    def post_process(self, raw_prediction, ground_truths):
        pred = raw_prediction.strip()
        if pred == "":
            pred = "None"
        pred = pred.strip(".。")  # reassign: str.strip returns a new string, it does not modify in place
        ground_truth = ground_truths[0]
        return pred, [ground_truth]

    def _metrics(self, prediction, ground_truth):
        raise NotImplementedError

    def evaluate(self, model, tokenizer):
        print("Running inference on the evaluation dataset...")
        results = self.infer(model, tokenizer)
        print("Evaluating results...")
        metrics = self.eval_metric(results)
        print("Evaluation complete. The results are as follows:")
        print(f"Average score: {sum(metrics) / len(metrics)}")
        return results, metrics
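# Illustrative sketch only, not part of the original benchmark set: it shows the contract a
# subclass must satisfy, namely implementing _metrics (and optionally overriding post_process).
# The class name ExactMatchEvaluator is invented for this example.
class ExactMatchEvaluator(BaseEvaluator):
    def _metrics(self, prediction, ground_truth):
        # Case-insensitive exact match between the post-processed prediction and the reference.
        return int(prediction.strip().lower() == ground_truth.strip().lower())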
class GPT4Evaluator(BaseEvaluator):
    """Scores predictions by asking an Azure-hosted GPT-4 judge to grade them against the reference."""

    def __init__(self, dataset, config):
        super().__init__(dataset, config)
        import openai
        self.client = openai.AzureOpenAI(
            api_key=config['openai_api_key'],
            api_version="2024-02-15-preview"
        )

    def query_gpt4(self, text):
        # Retry up to MAX_TRIAL times; return an empty string if every attempt fails.
        MAX_TRIAL = 5
        response_text = ""
        for i in range(MAX_TRIAL):
            try:
                chat_completion = self.client.chat.completions.create(
                    model="gpt-4-1106-preview",
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant. Follow the user's instructions carefully. Respond using markdown."},
                        {"role": "user", "content": text}
                    ],
                    max_tokens=80
                )
                response_text = chat_completion.choices[0].message.content
                break
            except Exception as e:
                print("ERROR:", e)
                print(f"Error connecting to the OpenAI server on attempt {i + 1}; retrying.")
                response_text = ""
                time.sleep(10)
        return response_text

    def parse_gpt4(self, response_text):
        # Extract the total score announced by the judge and rescale it from 0-10 to 0-1.
        score = re.findall(self.pattern, response_text)
        if score:
            score = float(score[0]) / 10
        else:
            score = 0.0
            print("GPT-4 did not return a parsable score:", response_text)
        return score

    @property
    def template(self):
        raise NotImplementedError

    @property
    def pattern(self):
        raise NotImplementedError

    def _metrics(self, prediction, ground_truth):
        text = self.template.format(prediction=prediction, ground_truth=ground_truth)
        response_text = self.query_gpt4(text)
        score = self.parse_gpt4(response_text)
        return score
class IntentEvaluator(BaseEvaluator):
    """Exact-match evaluation of JSON intent/slot predictions."""

    def post_process(self, raw_prediction, ground_truths):
        pred = raw_prediction.strip()
        if pred == "":
            pred = "None"
        pred = pred.strip('.。')
        # Unwrap a ```json ... ``` fenced block if the model produced one.
        if "```json" in pred:
            try:
                pred = pred[pred.index("```json") + 7:]
                pred = pred[:pred.index("```")]
            except Exception:
                print("Unable to parse answer:", pred)
                pred = "{}"
        # Keep only the first non-empty line of the prediction.
        if "\n" in pred:
            pred = [i for i in pred.split("\n") if i][0]
        pred = pred.strip('.。')
        ground_truth = ground_truths[0]
        return pred, [ground_truth]

    def _metrics(self, prediction, ground_truth):
        ground_truth = json.loads(ground_truth)
        try:
            prediction = json.loads(prediction)
        except Exception:
            print(f"Unable to parse prediction {prediction} of example with gt {ground_truth}")
            return 0.0
        intent_em = prediction.get('intent', '') == ground_truth.get('intent', '')
        # Normalize slot values into hashable keys: list values are sorted and stringified,
        # and whitespace is removed from predicted values before comparison.
        gt_slots = {
            (k, str(tuple(sorted([str(i) for i in v]))) if isinstance(v, list) else v)
            for k, v in ground_truth.get('slots', {}).items()
        }
        try:
            pred_slots = {
                (k, str(tuple(sorted([str(i).replace(" ", "") for i in v]))) if isinstance(v, list) else v.replace(" ", ""))
                for k, v in prediction.get('slots', {}).items()
            }
        except Exception:
            print(f"Parsed the prediction slots of {prediction} (gt {ground_truth}), but failed while processing their contents.")
            return 0.0
        # Full credit only when both the intent and the complete slot set match exactly.
        correct_slots = pred_slots.intersection(gt_slots)
        slots_em = (len(correct_slots) == len(pred_slots)) and (len(correct_slots) == len(gt_slots))
        return int(intent_em and slots_em)
# Chinese grading prompt for the phone-call summarization task. It asks the GPT-4 judge to deduct
# points for content accuracy, completeness, redundancy, and correctness of the follow-up action,
# and to open its reply with "...总分是:x分" ("total score: x points"), which
# SummaryEvaluator.pattern extracts. The prompt stays in Chinese because the benchmark data and
# the score-parsing regex are Chinese.
SummaryTemplate = """
请你进行以下电话总结内容的评分。请依据以下标准综合考量,以确定预测答案与标准答案之间的一致性程度。满分为10分,根据预测答案的准确性、完整性和相关性来逐项扣分。请先给每一项打分并给出总分,再给出打分理由。总分为10分减去每一项扣除分数之和,最低可扣到0分。请以“内容准确性扣x分,详细程度/完整性扣x分,...,总分是:x分"为开头。
1. **内容准确性**:
- 预测答案是否准确反映了客户问题或投诉的核心要点。
- 是否有任何关键信息被错误陈述或误解。
2. **详细程度/完整性**:
- 预测答案中包含的细节是否充分,能否覆盖标准答案中所有重要点。
- 对于任何遗漏的关键信息,应相应减分。
3. **内容冗余度**:
- 预测答案是否简洁明了,和标准答案风格一致,不存在冗余信息。
- 如果预测答案过长或与标准答案风格不一致,需相应减分。
4. **行动指令正确性**:
- 预测答案对后续处理的建议或请求是否与标准答案相符。
- 如果处理建议发生改变或丢失,需相应减分。
预测答案:{prediction}
参考答案:{ground_truth}
"""


class SummaryEvaluator(GPT4Evaluator):
    """GPT-4-judged evaluator for phone-call summarization."""

    @property
    def pattern(self):
        return r"总分是:(\d+\.\d+|\d+)分"

    @property
    def template(self):
        return SummaryTemplate
# Chinese grading prompt for the legal-judgment prediction task. The GPT-4 judge deducts points
# for relevance, completeness, accuracy, and objectivity/professionalism, and must open its reply
# with "...总分是:x分" so that LawEvaluator.pattern can extract the total score. As above, the
# prompt stays in Chinese to match the benchmark data and the parsing regex.
LawTemplate = """
请你进行以下法案判决预测内容的评分。请依据以下标准综合考量,以确定预测答案与标准答案之间的一致性程度。满分为10分,根据预测答案的准确性、完整性和相关性来逐项扣分。请先给每一项打分并给出总分,再给出打分理由。总分为10分减去每一项扣除分数之和,最低可扣到0分。请以“相关性扣x分,完整性扣x分,...,总分是:x分"为开头。
1. **相关性**:预测答案与标准答案的相关程度是最重要的评分标准。如果预测的判决情况与标准答案完全一致,即所有事实和结果都被精确复制或以不同但等效的方式表述,则应给予高分。若只有部分一致或存在偏差,则根据一致的程度适当扣分。如果没有预测判决内容,扣10分。
2. **完整性**:评估预测答案是否涵盖了所有标准答案中提到的关键点,包括但不限于当事人、具体金额、责任判定、费用承担等。如果遗漏重要信息,则应相应扣分。
3. **准确性**:检查预测答案中提及的细节、数字、日期和法律依据是否与标准答案保持一致。任何错误信息均需扣分,并且严重错误应该导致更多的扣分。
4. **客观性与专业性**:预测答案应客观反映法案内容并使用恰当的法律术语。主观臆断或非专业表达需酌情扣分。
预测答案:{prediction}
参考答案:{ground_truth}
"""


class LawEvaluator(GPT4Evaluator):
    """GPT-4-judged evaluator for legal judgment prediction."""

    @property
    def pattern(self):
        return r"总分是:(\d+\.\d+|\d+)分"

    @property
    def template(self):
        return LawTemplate
# English grading prompt for the machine-translation task. The GPT-4 judge scores content accuracy,
# completeness, and redundancy, and reports "total score: x points", which
# TranslationEvaluator.pattern extracts.
TranslationTemplate = """
You are an expert in machine translation. Please score the predicted answer against the standard answer out of 10 points based on the following criteria:
Content accuracy: Does the predicted answer accurately reflect the key points of the reference answer?
Level of detail/completeness: Does the predicted answer cover all important points from the standard answer?
Content redundancy: Is the predicted answer concise and consistent with the style of the standard answer?
Respond following the format: "Content accuracy x points, level of detail/completeness x points, ..., total score: x points". The total score is the average of all the scores. Do not give reasons for your scores.
Predicted answer: {prediction}
Reference answer: {ground_truth}
"""


class TranslationEvaluator(GPT4Evaluator):
    """GPT-4-judged evaluator for machine translation."""

    @property
    def pattern(self):
        return r"score: *?(\d+\.\d+|\d+) *?point"

    @property
    def template(self):
        return TranslationTemplate

    def post_process(self, raw_prediction, ground_truths):
        # Keep only the first paragraph of the generation as the translation.
        pred = raw_prediction.strip().split("\n\n")[0]
        if pred == "":
            pred = "None"
        pred = pred.strip(".。")  # reassign: str.strip does not modify in place
        ground_truth = ground_truths[0]
        return pred, [ground_truth]
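

# Minimal usage sketch (not part of the original file): it shows how an evaluator is wired to a
# model, a tokenizer, and a dataset. The model name and the example record below are placeholders
# invented for illustration; only the config keys and the dataset fields ('prompt', 'raw_answers')
# come from the evaluators above.
if __name__ == "__main__":
    from transformers import AutoModelForCausalLM, AutoTokenizer

    config = {
        "max_new_tokens": 128,                    # generation budget used by BaseEvaluator.infer
        "eval_batch_size": 8,                     # pipeline batch size
        "openai_api_key": "<azure-openai-key>",   # only needed by the GPT-4-judged evaluators
    }
    # Each record needs a prompt and a list of reference answers; for IntentEvaluator the first
    # reference must be a JSON string with "intent" and "slots".
    dataset = [
        {
            "prompt": "Book a flight to Beijing tomorrow.",
            "raw_answers": ['{"intent": "book_flight", "slots": {"destination": "Beijing", "date": "tomorrow"}}'],
        },
    ]

    model_name = "Qwen/Qwen1.5-1.8B-Chat"  # placeholder; any causal LM on the Hugging Face Hub works
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    evaluator = IntentEvaluator(dataset, config)
    results, scores = evaluator.evaluate(model, tokenizer)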