From ca470997011ed73ba2d401e7257668a4ce519e64 Mon Sep 17 00:00:00 2001
From: Nathan Habib
Date: Tue, 28 Jan 2025 07:25:53 +0000
Subject: [PATCH] Add OlympiadBench extended task

---
 src/lighteval/models/litellm_model.py        |  4 +---
 src/lighteval/tasks/extended/__init__.py     |  3 ++-
 .../tasks/extended/olympiade_bench/main.py   | 97 +++++++++++++++++++
 3 files changed, 100 insertions(+), 4 deletions(-)
 create mode 100644 src/lighteval/tasks/extended/olympiade_bench/main.py

diff --git a/src/lighteval/models/litellm_model.py b/src/lighteval/models/litellm_model.py
index 195221aa..ccb10c47 100644
--- a/src/lighteval/models/litellm_model.py
+++ b/src/lighteval/models/litellm_model.py
@@ -85,7 +85,7 @@ def __init__(self, config, env_config) -> None:
         self.API_RETRY_SLEEP = 3
         self.API_RETRY_MULTIPLIER = 2
         self.CONCURENT_CALLS = 20  # 100 leads to hitting Anthropic rate limits
-        self.TEMPERATURE = 0.7
+        self.TEMPERATURE = 0.3
         self.TOP_P = 0.95
         self.model = config.model
         self._tokenizer = encode
@@ -99,8 +99,6 @@ def _prepare_stop_sequence(self, stop_sequence):
         # Filter out whitespace-only stop sequences
         if stop_sequence:
             stop_sequence = [s for s in stop_sequence if s and s.strip()]
-            if not stop_sequence:  # If empty after filtering
-                stop_sequence = ["\n"]
         return stop_sequence

     def _prepare_max_new_tokens(self, max_new_tokens):
diff --git a/src/lighteval/tasks/extended/__init__.py b/src/lighteval/tasks/extended/__init__.py
index 97d99188..3e913593 100644
--- a/src/lighteval/tasks/extended/__init__.py
+++ b/src/lighteval/tasks/extended/__init__.py
@@ -27,9 +27,10 @@
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
+    import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench]
 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
diff --git a/src/lighteval/tasks/extended/olympiade_bench/main.py b/src/lighteval/tasks/extended/olympiade_bench/main.py
new file mode 100644
index 00000000..a648ed57
--- /dev/null
+++ b/src/lighteval/tasks/extended/olympiade_bench/main.py
@@ -0,0 +1,97 @@
+# MIT License
+
+# Copyright (c) 2024 The HuggingFace Team
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+
+from lighteval.metrics.dynamic_metrics import (
+    ExprExtractionConfig,
+    LatexExtractionConfig,
+    multilingual_extractive_match_metric,
+)
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.tasks.requests import Doc
+from lighteval.utils.language import Language
+
+
+# Each problem has a precise final answer, used as the gold target for extractive answer matching
+def olympiad_bench_prompt(line, task_name: str = None):
+    return Doc(
+        task_name=task_name,
+        query=line["question"],
+        choices=[line["final_answer"]],
+        gold_index=0,
+        instruction="",
+        specific={},
+    )
+
+
+# * OE: Open-ended questions
+# * TP: Theorem proof problems
+# * MM: Multimodal
+# * TO: Text-only
+# * physics: Physics problems
+# * maths: Math problems
+# * en: English
+# * zh: Chinese
+# * COMP: Competition problems
+# * CEE: Chinese College Entrance Exam problems
+
+question_type = ["OE", "TP"]
+multimodality = ["TO"]  # "MM" (multimodal) subsets are not included yet
+subject = ["physics", "maths"]
+language = ["en"]  # "zh" subsets are not included yet
+source = ["COMP", "CEE"]
+
+olympiad_bench_subsets = []
+
+for qt in question_type:
+    for mm in multimodality:
+        for sub in subject:
+            for lang in language:
+                for src in source:
+                    olympiad_bench_subsets.append(f"{qt}_{mm}_{sub}_{lang}_{src}")
+
+extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]
+
+metric = multilingual_extractive_match_metric(
+    language=Language.ENGLISH,
+    gold_extraction_target=extraction_targets,
+    pred_extraction_target=extraction_targets,
+    precision=6,
+)
+# We create the task config
+olympiad_bench = LightevalTaskConfig(
+    name="olympiad_bench",
+    prompt_function=olympiad_bench_prompt,
+    suite=["extended"],
+    hf_repo="Hothan/OlympiadBench",
+    hf_subset=olympiad_bench_subsets[0],  # "OE_TO_physics_en_COMP"; only the first subset is wired up for now
+    metric=[metric],
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split="train",
+    few_shots_select="random_sampling",
+    generation_size=2048,
+    stop_sequence=[],  # no stop sequence; generation ends at the EOT token
+    version="1.0",
+)

+TASKS_TABLE = [olympiad_bench]
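
---
Notes (illustrations only, not part of the patch):

The `_prepare_stop_sequence` change above means an empty or whitespace-only
stop list is now returned as-is instead of falling back to ["\n"]; that is what
lets the new task pass `stop_sequence=[]` and stop generation only at the EOT
token. A minimal standalone sketch of the filtering behaviour:

    # Re-implementation of the filtering logic above, for illustration.
    def prepare_stop_sequence(stop_sequence):
        # Filter out whitespace-only stop sequences; no default is re-added.
        if stop_sequence:
            stop_sequence = [s for s in stop_sequence if s and s.strip()]
        return stop_sequence

    assert prepare_stop_sequence(["###", " "]) == ["###"]  # whitespace-only entries dropped
    assert prepare_stop_sequence([" "]) == []              # previously became ["\n"]
    assert prepare_stop_sequence([]) == []                 # empty list passes through unchanged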
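The prompt function maps each dataset row directly to a Doc whose only choice
is the gold final answer; the extractive-match metric then parses expressions
and LaTeX out of the model's generation and compares them against that gold.
A sketch of the expected behaviour with a made-up row (real rows come from
Hothan/OlympiadBench):

    # Hypothetical row, for illustration; the field names match those used above.
    line = {"question": "Compute 1 + 1.", "final_answer": "2"}
    doc = olympiad_bench_prompt(line, task_name="extended|olympiad_bench")
    assert doc.choices == ["2"]
    assert doc.gold_index == 0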
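Assuming the standard lighteval task-spec syntax
("suite|task|few_shots|truncate_few_shots"), the merged task should then be
runnable with something along these lines (the exact CLI invocation depends on
the installed lighteval version):

    lighteval accelerate "pretrained=<model>" "extended|olympiad_bench|0|0"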