Commit ca47099
adds olympiad bench
NathanHB committed Jan 28, 2025
1 parent cb075a5 commit ca47099
Showing 3 changed files with 105 additions and 4 deletions.
7 changes: 4 additions & 3 deletions src/lighteval/models/litellm_model.py
@@ -85,7 +85,7 @@ def __init__(self, config, env_config) -> None:
         self.API_RETRY_SLEEP = 3
         self.API_RETRY_MULTIPLIER = 2
         self.CONCURENT_CALLS = 20  # 100 leads to hitting Anthropic rate limits
-        self.TEMPERATURE = 0.7
+        self.TEMPERATURE = 0.3
         self.TOP_P = 0.95
         self.model = config.model
         self._tokenizer = encode
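
For context, these attributes feed the sampling parameters of litellm's completion call; a minimal sketch of such a call (illustrative, using standard litellm kwargs, not this file's exact code):

import litellm

def sample_completion(model: str, prompt: str) -> str:
    # Sampling parameters mirror the attributes set in __init__ above;
    # this commit lowers temperature from 0.7 to 0.3 for less variable output.
    response = litellm.completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        top_p=0.95,
    )
    return response.choices[0].message.content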
@@ -99,8 +99,6 @@ def _prepare_stop_sequence(self, stop_sequence):
         # Filter out whitespace-only stop sequences
         if stop_sequence:
             stop_sequence = [s for s in stop_sequence if s and s.strip()]
-            if not stop_sequence:  # If empty after filtering
-                stop_sequence = ["\n"]
         return stop_sequence
 
     def _prepare_max_new_tokens(self, max_new_tokens):
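
With the fallback removed, a stop list that filters down to nothing now stays empty instead of defaulting to ["\n"]. A standalone sketch of the new behavior:

def prepare_stop_sequence(stop_sequence):
    # Drop None, empty, and whitespace-only entries; an empty result is returned as-is.
    if stop_sequence:
        stop_sequence = [s for s in stop_sequence if s and s.strip()]
    return stop_sequence

assert prepare_stop_sequence(["END", "\n", " "]) == ["END"]
assert prepare_stop_sequence([" "]) == []  # previously became ["\n"]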
@@ -143,6 +141,9 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_sequence):
             response = litellm.completion(**kwargs)
 
+            print(response)
+            print(kwargs)
+
             # If response is empty, retry without caching (maybe the error is recoverable and solved with a retry)
             if response.choices[0].message.content is None:
                 kwargs["caching"] = False
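
The guard above retries with litellm's response cache disabled when the first completion comes back empty; a condensed sketch of that pattern (illustrative, not the method's full retry logic):

import litellm

def complete_with_cache_fallback(**kwargs):
    response = litellm.completion(**kwargs)
    if response.choices[0].message.content is None:
        # A cached empty/failed response may be recoverable: bypass the cache and retry.
        kwargs["caching"] = False
        response = litellm.completion(**kwargs)
    return response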
3 changes: 2 additions & 1 deletion src/lighteval/tasks/extended/__init__.py
@@ -27,9 +27,10 @@
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
+    import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks
 
-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench]
 
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
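
Each module in AVAILABLE_EXTENDED_TASKS_MODULES exposes a TASKS_TABLE list (as the new file below does); a rough sketch of how a registry could flatten them into one set of available tasks (illustrative, not lighteval's actual registry code):

extended_tasks = []
for module in AVAILABLE_EXTENDED_TASKS_MODULES:
    extended_tasks.extend(module.TASKS_TABLE)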
99 changes: 99 additions & 0 deletions src/lighteval/tasks/extended/olympiade_bench/main.py
@@ -0,0 +1,99 @@
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from lighteval.metrics.dynamic_metrics import (
    ExprExtractionConfig,
    LatexExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


# OlympiadBench prompt: the dataset question becomes the query and its
# final_answer is the single gold choice; answer matching is handled by the
# extractive metric defined below.
def olympiad_bench_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[line["final_answer"]],
        gold_index=0,
        instruction="",
        specific={},
    )
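
For illustration, a hypothetical dataset row (field values invented) maps as follows:

row = {"question": "Compute 2 + 2.", "final_answer": "4"}
doc = olympiad_bench_prompt(row, task_name="extended|olympiad_bench")
# doc.query == "Compute 2 + 2."; doc.choices == ["4"]; doc.gold_index == 0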


# * OE: Open-ended questions
# * TP: Theorem proof problems
# * MM: Multimodal
# * TO: Text-only
# * physics: Physics problems
# * maths: Math problems
# * en: English
# * zh: Chinese
# * COMP: Competition problems
# * CEE: Chinese College Entrance Exam problems

question_type = ["OE", "TP"]
multimodality = ["TO"] # MM
subject = ["physics", "maths"]
language = ["en"] # "zh"]
source = ["COMP", "CEE"]

olympiad_bench_subsets = []

for qt in question_type:
    for mm in multimodality:
        for sub in subject:
            for lang in language:
                for src in source:
                    olympiad_bench_subsets.append(f"{qt}_{mm}_{sub}_{lang}_{src}")
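
The nested loops compute a Cartesian product of the enabled dimensions; an equivalent formulation with itertools, should one prefer it:

import itertools

olympiad_bench_subsets = [
    "_".join(parts)
    for parts in itertools.product(question_type, multimodality, subject, language, source)
]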

extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]

metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=extraction_targets,
    pred_extraction_target=extraction_targets,
    precision=6,
)
# We create the task config
olympiad_bench = LightevalTaskConfig(
    name="olympiad_bench",
    prompt_function=olympiad_bench_prompt,
    suite=["extended"],
    hf_repo="Hothan/OlympiadBench",
    hf_subset=olympiad_bench_subsets[0],
    metric=[metric],
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
    few_shots_select="random_sampling",
    generation_size=2048,
    stop_sequence=[],  # no stop sequence, will use eot token
    version="1.0",
)

# print(olympiad_bench)

TASKS_TABLE = [olympiad_bench]
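
As committed, hf_subset=olympiad_bench_subsets[0] wires up only the first generated subset ("OE_TO_physics_en_COMP"). A hypothetical per-subset variant, not part of this commit, could register one config per subset:

# Hypothetical alternative (not in this commit): one task per OlympiadBench subset.
TASKS_TABLE = [
    LightevalTaskConfig(
        name=f"olympiad_bench:{subset}",
        prompt_function=olympiad_bench_prompt,
        suite=["extended"],
        hf_repo="Hothan/OlympiadBench",
        hf_subset=subset,
        metric=[metric],
        hf_avail_splits=["train"],
        evaluation_splits=["train"],
        few_shots_split="train",
        few_shots_select="random_sampling",
        generation_size=2048,
        stop_sequence=[],
        version="1.0",
    )
    for subset in olympiad_bench_subsets
]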
