
Commit ca47099

adds olympiad bench

Parent: cb075a5

3 files changed (+105, -4 lines)


src/lighteval/models/litellm_model.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -85,7 +85,7 @@ def __init__(self, config, env_config) -> None:
         self.API_RETRY_SLEEP = 3
         self.API_RETRY_MULTIPLIER = 2
         self.CONCURENT_CALLS = 20  # 100 leads to hitting Anthropic rate limits
-        self.TEMPERATURE = 0.7
+        self.TEMPERATURE = 0.3
         self.TOP_P = 0.95
         self.model = config.model
         self._tokenizer = encode
@@ -99,8 +99,6 @@ def _prepare_stop_sequence(self, stop_sequence):
         # Filter out whitespace-only stop sequences
         if stop_sequence:
             stop_sequence = [s for s in stop_sequence if s and s.strip()]
-            if not stop_sequence:  # If empty after filtering
-                stop_sequence = ["\n"]
         return stop_sequence

     def _prepare_max_new_tokens(self, max_new_tokens):
@@ -143,6 +141,9 @@ def __call_api(self, prompt, return_logits, max_new_tokens, num_samples, stop_se

         response = litellm.completion(**kwargs)

+        print(response)
+        print(kwargs)
+
         # If response is empty, retry without caching (maybe the error is recoverable and solved with a retry)
         if response.choices[0].message.content is None:
             kwargs["caching"] = False
```
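
With the `["\n"]` fallback deleted, a stop list that becomes empty after filtering now stays empty instead of being coerced to a newline stop; this matters for the task added below, which passes `stop_sequence=[]` and relies on the model's end-of-turn token. A minimal standalone sketch of the new behavior (hypothetical free function; in the commit this logic is a method on the model class):

```python
# Sketch of the updated filtering: whitespace-only entries are dropped and no
# default stop sequence is re-inserted when the list ends up empty.
def prepare_stop_sequence(stop_sequence):
    # Filter out whitespace-only stop sequences
    if stop_sequence:
        stop_sequence = [s for s in stop_sequence if s and s.strip()]
    return stop_sequence

print(prepare_stop_sequence(["\n", "  ", "STOP"]))  # ['STOP']
print(prepare_stop_sequence(["\n", "  "]))          # [] (previously ['\n'])
print(prepare_stop_sequence(None))                  # None
```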

src/lighteval/tasks/extended/__init__.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -27,9 +27,10 @@
     import lighteval.tasks.extended.ifeval.main as ifeval
     import lighteval.tasks.extended.mix_eval.main as mix_eval
     import lighteval.tasks.extended.mt_bench.main as mt_bench
+    import lighteval.tasks.extended.olympiade_bench.main as olympiad_bench
     import lighteval.tasks.extended.tiny_benchmarks.main as tiny_benchmarks

-    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval]
+    AVAILABLE_EXTENDED_TASKS_MODULES = [ifeval, tiny_benchmarks, mt_bench, mix_eval, olympiad_bench]

 else:
     AVAILABLE_EXTENDED_TASKS_MODULES = []
```
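
Each module in this registry exposes a `TASKS_TABLE` of task configs, as the new `olympiade_bench` module does below. A hypothetical sketch of how such a registry can be flattened into one task list (lighteval's actual loader differs):

```python
# Hypothetical illustration only: collect every task config contributed by the
# registered extended-task modules.
all_extended_tasks = []
for module in AVAILABLE_EXTENDED_TASKS_MODULES:
    all_extended_tasks.extend(module.TASKS_TABLE)

print([task.name for task in all_extended_tasks])  # includes "olympiad_bench"
```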
src/lighteval/tasks/extended/olympiade_bench/main.py

Lines changed: 99 additions & 0 deletions (new file)

```python
# MIT License

# Copyright (c) 2024 The HuggingFace Team

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.


from lighteval.metrics.dynamic_metrics import (
    ExprExtractionConfig,
    LatexExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.tasks.lighteval_task import LightevalTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.utils.language import Language


# Very specific task where there are no precise outputs but instead we test if the format obeys rules
def olympiad_bench_prompt(line, task_name: str = None):
    return Doc(
        task_name=task_name,
        query=line["question"],
        choices=[line["final_answer"]],
        gold_index=0,
        instruction="",
        specific={},
    )


# * OE: Open-ended questions
# * TP: Theorem proof problems
# * MM: Multimodal
# * TO: Text-only
# * physics: Physics problems
# * maths: Math problems
# * en: English
# * zh: Chinese
# * COMP: Competition problems
# * CEE: Chinese College Entrance Exam problems

question_type = ["OE", "TP"]
multimodality = ["TO"]  # MM
subject = ["physics", "maths"]
language = ["en"]  # "zh"]
source = ["COMP", "CEE"]

olympiad_bench_subsets = []

for qt in question_type:
    for mm in multimodality:
        for sub in subject:
            for lang in language:
                for src in source:
                    olympiad_bench_subsets.append(f"{qt}_{mm}_{sub}_{lang}_{src}")

extraction_targets = [ExprExtractionConfig(), LatexExtractionConfig()]

metric = multilingual_extractive_match_metric(
    language=Language.ENGLISH,
    gold_extraction_target=extraction_targets,
    pred_extraction_target=extraction_targets,
    precision=6,
)

# We create the task config
olympiad_bench = LightevalTaskConfig(
    name="olympiad_bench",
    prompt_function=olympiad_bench_prompt,
    suite=["extended"],
    hf_repo="Hothan/OlympiadBench",
    hf_subset=olympiad_bench_subsets[0],
    metric=[metric],
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split="train",
    few_shots_select="random_sampling",
    generation_size=2048,
    stop_sequence=[],  # no stop sequence, will use eot token
    version="1.0",
)

# print(olympiad_bench)

TASKS_TABLE = [olympiad_bench]
```
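
The five nested loops are a plain Cartesian product, so with the commented-out `MM` and `zh` options excluded they produce 2 × 1 × 2 × 1 × 2 = 8 subset names; note the config only wires up `olympiad_bench_subsets[0]` as its `hf_subset`. An equivalent sketch using `itertools.product`:

```python
from itertools import product

# Equivalent to the nested loops: Cartesian product of the subset axes.
subsets = [
    f"{qt}_{mm}_{sub}_{lang}_{src}"
    for qt, mm, sub, lang, src in product(
        ["OE", "TP"], ["TO"], ["physics", "maths"], ["en"], ["COMP", "CEE"]
    )
]
print(subsets)
# ['OE_TO_physics_en_COMP', 'OE_TO_physics_en_CEE',
#  'OE_TO_maths_en_COMP', 'OE_TO_maths_en_CEE',
#  'TP_TO_physics_en_COMP', 'TP_TO_physics_en_CEE',
#  'TP_TO_maths_en_COMP', 'TP_TO_maths_en_CEE']
```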
