
Commit ca47099

adds olympiad bench

Parent: cb075a5

3 files changed (+105, -4 lines)


src/lighteval/models/litellm_model.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -85,7 +85,7 @@ def __init__(self, config, env_config) -> None:
         self.API_RETRY_SLEEP = 3
         self.API_RETRY_MULTIPLIER = 2
         self.CONCURENT_CALLS = 20  # 100 leads to hitting Anthropic rate limits
-        self.TEMPERATURE = 0.7
+        self.TEMPERATURE = 0.3
         self.TOP_P = 0.95
         self.model = config.model
         self._tokenizer = encode
@@ -99,8 +99,6 @@ def _prepare_stop_sequence(self, stop_sequence):
         # Filter out whitespace-only stop sequences
         if stop_sequence:
             stop_sequence = [s for s in stop_sequence if s and s.strip()]
-            if not stop_sequence:  # If empty after filtering
-                stop_sequence = ["\n"]
         return stop_sequence

     def _prepare_max_new_tokens(self, max_new_tokens):
@@ -143,6 +141,9 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se

         response = litellm.completion(**kwargs)

+        print(response)
+        print(kwargs)
+
         # If response is empty, retry without caching (maybe the error is recoverable and solved with a retry)
         if response.choices[0].message.content is None:
             kwargs["caching"] = False
```
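
With the `["\n"]` fallback deleted, a stop list that becomes empty after filtering now stays empty instead of being coerced to a newline stop; this matters for the task added below, which passes `stop_sequence=[]` and relies on the model's end-of-turn token. A minimal standalone sketch of the new behavior (hypothetical free function; in the commit this logic is a method on the model class):

```python
# Sketch of the updated filtering: whitespace-only entries are dropped and no
# default stop sequence is re-inserted when the list ends up empty.
def prepare_stop_sequence(stop_sequence):
    # Filter out whitespace-only stop sequences
    if stop_sequence:
        stop_sequence = [s for s in stop_sequence if s and s.strip()]
    return stop_sequence

print(prepare_stop_sequence(["\n", "  ", "STOP"]))  # ['STOP']
print(prepare_stop_sequence(["\n", "  "]))          # [] (previously ['\n'])
print(prepare_stop_sequence(None))                  # None
```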

src/lighteval/tasks/extended/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -27,9 +27,10 @@
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
+    import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench]

 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
```
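
Each module in this registry exposes a `TASKS_TABLE` of task configs, as the new `olympiade_bench` module does below. A hypothetical sketch of how such a registry can be flattened into one task list (lighteval's actual loader differs):

```python
# Hypothetical illustration only: collect every task config contributed by the
# registered extended-task modules.
all_extended_tasks = []
for module in AVAILABLE_EXTENDED_TASKS_MODULES:
    all_extended_tasks.extend(module.TASKS_TABLE)

print([task.name for task in all_extended_tasks])  # includes "olympiad_bench"
```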
src/lighteval/tasks/extended/olympiade_bench/main.py

Lines changed: 99 additions & 0 deletions (new file)

```python
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from lighteval.metrics.dynamic_metrics import (
    ExprExtractionConfig,
    LatexExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


# Very specific task where there are no precise outputs but instead we test if the format obeys rules
def olympiad_bench_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[line["final_answer"]],
        gold_index=0,
        instruction="",
        specific={},
    )


# * OE: Open-ended questions
# * TP: Theorem proof problems
# * MM: Multimodal
# * TO: Text-only
# * physics: Physics problems
# * maths: Math problems
# * en: English
# * zh: Chinese
# * COMP: Competition problems
# * CEE: Chinese College Entrance Exam problems

question_type = ["OE", "TP"]
multimodality = ["TO"]  # MM
subject = ["physics", "maths"]
language = ["en"]  # "zh"]
source = ["COMP", "CEE"]

olympiad_bench_subsets = []

for qt in question_type:
    for mm in multimodality:
        for sub in subject:
            for lang in language:
                for src in source:
                    olympiad_bench_subsets.append(f"{qt}_{mm}_{sub}_{lang}_{src}")

extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]

metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=extraction_targets,
    pred_extraction_target=extraction_targets,
    precision=6,
)

# We create the task config
olympiad_bench = LightevalTaskConfig(
    name="olympiad_bench",
    prompt_function=olympiad_bench_prompt,
    suite=["extended"],
    hf_repo="Hothan/OlympiadBench",
    hf_subset=olympiad_bench_subsets[0],
    metric=[metric],
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
    few_shots_select="random_sampling",
    generation_size=2048,
    stop_sequence=[],  # no stop sequence, will use eot token
    version="1.0",
)

# print(olympiad_bench)

TASKS_TABLE = [olympiad_bench]
```
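
The five nested loops are a plain Cartesian product, so with the commented-out `MM` and `zh` options excluded they produce 2 × 1 × 2 × 1 × 2 = 8 subset names; note the config only wires up `olympiad_bench_subsets[0]` as its `hf_subset`. An equivalent sketch using `itertools.product`:

```python
from itertools import product

# Equivalent to the nested loops: Cartesian product of the subset axes.
subsets = [
    f"{qt}_{mm}_{sub}_{lang}_{src}"
    for qt, mm, sub, lang, src in product(
        ["OE", "TP"], ["TO"], ["physics", "maths"], ["en"], ["COMP", "CEE"]
    )
]
print(subsets)
# ['OE_TO_physics_en_COMP', 'OE_TO_physics_en_CEE',
#  'OE_TO_maths_en_COMP', 'OE_TO_maths_en_CEE',
#  'TP_TO_physics_en_COMP', 'TP_TO_physics_en_CEE',
#  'TP_TO_maths_en_COMP', 'TP_TO_maths_en_CEE']
```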
