Commit f4eb826

fix
1 parent 2b40807 commit f4eb826

2 files changed: +20 -31 lines changed

lmms_eval/tasks/vl_rewardbench/utils.py

Lines changed: 17 additions & 23 deletions
@@ -5,11 +5,8 @@
 from collections import defaultdict
 
 import requests
-from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
 from loguru import logger as eval_logger
 
-dir_name = os.path.dirname(os.path.abspath(__file__))
-
 
 LLM_PARSE_ANSWER_PROMPT = """
 You are given a pairwise judgement for two responses. Please return the better response according to the judgement.
@@ -18,6 +15,16 @@
 Judgement: {judgement}
 """
 
+API_TYPE = os.getenv("API_TYPE", "openai")
+
+if API_TYPE == "openai":
+    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
+    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
+    headers = {
+        "Authorization": f"Bearer {API_KEY}",
+        "Content-Type": "application/json",
+    }
+
 
 def get_prompt(data_obj, random_number):
     answers = [data_obj["response"][0], data_obj["response"][1]] if random_number == 0 else [data_obj["response"][1], data_obj["response"][0]]
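The block added here reads the judge configuration from environment variables at module import time. A minimal usage sketch, assuming the variable names shown in the diff; the key value is a placeholder, not a real credential:

import os

# Point the judge at the OpenAI-compatible endpoint before utils.py is imported;
# these are the variables the added block reads via os.getenv.
os.environ["API_TYPE"] = "openai"
os.environ["OPENAI_API_URL"] = "https://api.openai.com/v1/chat/completions"
os.environ["OPENAI_API_KEY"] = "sk-placeholder"  # placeholder key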
@@ -47,32 +54,18 @@ def vlrewardbench_doc_to_visual(doc):
     return [doc["image"].convert("RGB")]
 
 
-def vlrewardbench_doc_to_text(doc, lmms_eval_specific_kwargs=None):
+def vlrewardbench_doc_to_text(doc):
     # we randomly choose the order of the answers to avoid positional bias
     random_number = sum(len(res) for res in doc["response"]) % 2  # we use the length sum % 2 as a random number generator to decide the order of the answers
-    # doc["random_number"] = random_number  # save it for later use Notes: This cannot be done as the doc iterator would be reset
     query_prompt = get_prompt(doc, random_number)
-
     return query_prompt
 
 
-API_TYPE = os.getenv("API_TYPE", "openai")
-
-if API_TYPE == "openai":
-    API_URL = os.getenv("OPENAI_API_URL", "https://api.openai.com/v1/chat/completions")
-    API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_API_KEY")
-    headers = {
-        "Authorization": f"Bearer {API_KEY}",
-        "Content-Type": "application/json",
-    }
-
-
 def parse_pred_ans(pred_ans):
-    pred_ans = pred_ans.lower().strip()
-    pattern = r"(?:overall judgment|therefore)\s*.*\s*-*\s*answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
+    pred_ans = pred_ans.strip()
+    pattern = r"(?:Overall Judgment|Therefore)\s*.*\s*-*\s*Answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
     match = re.search(pattern, pred_ans.replace("\n", "").replace("*", ""), re.IGNORECASE)
     flag_choice = -1
-
     if match:
         answer_number = int(match.group(1))
         flag_choice = answer_number
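The reworked parser keeps the judgement's original casing and relies on re.IGNORECASE instead of lower-casing the string, so the capitalized alternatives in the pattern still match. A small self-contained check, using a made-up judgement string:

import re

# Same pattern as the new parse_pred_ans; the judgement text is invented for illustration.
pattern = r"(?:Overall Judgment|Therefore)\s*.*\s*-*\s*Answer\s*(\d+)\s*is\s*(?:the\s*)?(?:slightly\s*)?better"
judgement = "Overall Judgment: Answer 2 is slightly better."
match = re.search(pattern, judgement.replace("\n", "").replace("*", ""), re.IGNORECASE)
print(int(match.group(1)) if match else -1)  # -> 2; -1 mirrors the unparsed fallback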
@@ -92,7 +85,7 @@ def parse_pred_ans(pred_ans):
 
 def parse_by_llm(response, model="gpt-4o-mini", max_tokens=32):
     # get the judgement from response using gpt-4o
-    data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
+    data = {"max_tokens": max_tokens, "model": model, "temperature": 0.0, "top_p": 1.0, "presence_penalty": 1, "messages": [{"role": "user", "content": LLM_PARSE_ANSWER_PROMPT.format(judgement=response)}]}
     response = requests.post(API_URL, headers=headers, data=json.dumps(data).encode("utf-8"))
     result = response.content.decode("utf-8")
     dict_result = json.loads(result)
@@ -109,10 +102,11 @@ def vlrewardbench_process_results(doc, results):
         a dictionary with key: metric name (in this case mme score), value: metric value
     """
     pred = results[0]
-    pred_ans = parse_pred_ans(pred)
+    pred_ans = parse_pred_ans(pred)  # 1 or 2, indicating which answer is better
     random_number = sum(len(res) for res in doc["response"]) % 2  # we use the length sum % 2 as a random number generator to decide the order of the answers
+    # Note: human_ranking [0, 1] -> answer 1 is better, [1, 0] -> answer 2 is better
+    gt_ans = doc["human_ranking"].index(0 if random_number == 0 else 1) + 1
 
-    gt_ans = doc["human_ranking"].index(1 if random_number == 0 else 0) + 1
     if pred_ans == gt_ans:
         score = 1.0
     else:
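The gt_ans change is the substantive fix in this commit: human_ranking stores ranks (0 marks the preferred answer), so when the answers are shown in their original order the ground truth is the position of rank 0, and when they are swapped it is the position of rank 1. A worked example with hypothetical document values:

# Hypothetical doc: answer 2 is the human-preferred response.
human_ranking = [1, 0]                      # rank per answer; 0 marks the preferred one
responses = ["short answer", "a somewhat longer answer"]

random_number = sum(len(r) for r in responses) % 2    # 36 % 2 == 0 -> original order kept
gt_ans = human_ranking.index(0 if random_number == 0 else 1) + 1
print(gt_ans)  # -> 2, the preferred answer's position in the order shown to the judge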

lmms_eval/tasks/vl_rewardbench/vl_rewardbench.yaml

Lines changed: 3 additions & 8 deletions
@@ -9,20 +9,15 @@ doc_to_text: !function utils.vlrewardbench_doc_to_text
 doc_to_target: "human_ranking"
 generation_kwargs:
   max_new_tokens: 1024
-  temperature: 0
-  top_p: 1.0
+  temperature: 1.0
+  top_p: 1.0
   num_beams: 1
-  do_sample: false
+  do_sample: true
 # The return value of process_results will be used by metrics
 process_results: !function utils.vlrewardbench_process_results
-# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
-# e.g. Following metrics `mme_perception_score` is custom defined.
-# So `mme_process_results` function should return the dict `{"mme_perception_score": {sub_k:sub_v, ..., } }`
-# And the `mme_aggregate_results` function could get the dict `{sub_k:sub_v, ..., }`, and use the information to gather the final accuracy.
 metric_list:
   - metric: vlreward_score
     aggregation: !function utils.vlrewardbench_aggregate_results
     higher_is_better: true
-
 metadata:
   - version: 0.0
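The generation_kwargs change switches the task from greedy decoding to pure sampling at temperature 1.0. A minimal sketch of the new settings, assuming they are ultimately consumed as a Hugging Face GenerationConfig (an assumption about the model backend, not something this diff shows):

from transformers import GenerationConfig

# do_sample=True with temperature=1.0 and top_p=1.0 samples from the unmodified
# model distribution instead of taking the argmax token at each step.
gen_cfg = GenerationConfig(
    max_new_tokens=1024,
    temperature=1.0,
    top_p=1.0,
    num_beams=1,
    do_sample=True,
)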
