from collections import defaultdict

import requests
- from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
from loguru import logger as eval_logger

- dir_name = os.path.dirname(os.path.abspath(__file__))
-

LLM_PARSE_ANSWER_PROMPT = """
You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
Judgement: {judgement}
"""

+ API_TYPE = os.getenv("API_TYPE", "openai")
+
+ if API_TYPE == "openai":
+     API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+     API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+     headers = {
+         "Authorization": f"Bearer {API_KEY}",
+         "Content-Type": "application/json",
+     }
+

def get_prompt(data_obj, random_number):
    answers = [data_obj["response"][0], data_obj["response"][1]] if random_number == 0 else [data_obj["response"][1], data_obj["response"][0]]
@@ -47,32 +54,18 @@ def vlrewardbench_doc_to_visual(doc):
    return [doc["image"].convert("RGB")]


- def vlrewardbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+ def vlrewardbench_doc_to_text(doc):
    # we randomly choose the order of the answers to avoid positional bias
    random_number = sum(len(res) for res in doc["response"]) % 2  # we use the length sum % 2 as a random number generator to decide the order of the answers
-     # doc["random_number"] = random_number  # save it for later use Notes: This cannot be done as the doc iterator would be reset
    query_prompt = get_prompt(doc, random_number)
-
    return query_prompt


- API_TYPE = os.getenv("API_TYPE", "openai")
-
- if API_TYPE == "openai":
-     API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-     API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-     headers = {
-         "Authorization": f"Bearer {API_KEY}",
-         "Content-Type": "application/json",
-     }
-
-
def parse_pred_ans(pred_ans):
-     pred_ans = pred_ans.lower().strip()
-     pattern = r"(?:overall judgment|therefore )\s*.*\s*-*\s*answer \s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
+     pred_ans = pred_ans.strip()
+     pattern = r"(?:Overall Judgment|Therefore )\s*.*\s*-*\s*Answer \s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
    match = re.search(pattern, pred_ans.replace("\n", "").replace("*", ""), re.IGNORECASE)
    flag_choice = -1
-
    if match:
        answer_number = int(match.group(1))
        flag_choice = answer_number
@@ -92,7 +85,7 @@ def parse_pred_ans(pred_ans):

def parse_by_llm(response, model="gpt-4o-mini", max_tokens=32):
    # get the judgement from response using gpt-4o
-     data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
+     data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1.0, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
    response = requests.post(API_URL, headers=headers, data=json.dumps(data).encode("utf-8"))
    result = response.content.decode("utf-8")
    dict_result = json.loads(result)
@@ -109,10 +102,11 @@ def vlrewardbench_process_results(doc, results):
        a dictionary with key: metric name (in this case mme score), value: metric value
    """
    pred = results[0]
-     pred_ans = parse_pred_ans(pred)
+     pred_ans = parse_pred_ans(pred)  # 1 or 2, indicating which answer is better
    random_number = sum(len(res) for res in doc["response"]) % 2  # we use the length sum % 2 as a random number generator to decide the order of the answers
+     # Note: human_ranking [0, 1] -> answer 1 is better, [1, 0] -> answer 2 is better
+     gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1

-     gt_ans = doc["human_ranking"].index(1 if random_number == 0 else 0) + 1
    if pred_ans == gt_ans:
        score = 1.0
    else:
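
A minimal, self-contained sketch of the ordering logic the last hunk corrects. It is not part of the commit itself; the helper name order_and_label and the toy doc values are invented for illustration, while the two key lines mirror the diff.

def order_and_label(doc):
    # Same parity trick as the diff: swap the two responses when the summed
    # response lengths are odd, so answer order varies across examples.
    random_number = sum(len(res) for res in doc["response"]) % 2
    answers = doc["response"] if random_number == 0 else doc["response"][::-1]
    # human_ranking holds rank positions: [0, 1] means response 0 is better,
    # [1, 0] means response 1 is better. Without a swap the better response is
    # presented at position human_ranking.index(0); with a swap it moves to the
    # other slot, which is human_ranking.index(1). Adding 1 converts the 0-based
    # position into the "Answer 1" / "Answer 2" label the judge is asked for.
    gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1
    return answers, gt_ans

toy_doc = {"response": ["short answer", "a longer answer"], "human_ranking": [1, 0]}
print(order_and_label(toy_doc))
# lengths 12 + 15 = 27 (odd) -> order swapped, so the better response (index 1)
# is presented first and gt_ans == 1

For comparison, the pre-fix expression index(1 if random_number == 0 else 0) + 1 yields gt_ans == 2 on this toy doc, labelling the worse response as ground truth; that inversion is what the hunk changes.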
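A similarly illustrative check of the updated parse_pred_ans pattern. The sample judgement strings below are made up, and only the regex step is shown (the rest of the function lies outside the visible hunks); the pattern itself is copied from the + lines above.

import re

# Keeping the original capitalisation in the pattern is harmless here because
# re.search is called with re.IGNORECASE.
pattern = r"(?:Overall Judgment|Therefore )\s*.*\s*-*\s*Answer \s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"

samples = [
    "Overall Judgment: Answer 1 is better because it is grounded in the image.",
    "Therefore Answer 2 is slightly better.",
    "Both answers fail to describe the image.",  # no regex match
]
for s in samples:
    m = re.search(pattern, s.replace("\n", "").replace("*", ""), re.IGNORECASE)
    print(int(m.group(1)) if m else -1)  # prints 1, 2, -1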