Commit: Refactor

lewtun committed Feb 4, 2025
1 parent 88f939e commit fa00c5f
Showing 4 changed files with 78 additions and 89 deletions.
11 changes: 11 additions & 0 deletions src/lighteval/metrics/metrics.py
@@ -24,6 +24,10 @@
import numpy as np
from aenum import Enum

from lighteval.metrics.dynamic_metrics import (
    IndicesExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.metrics.harness_compatibility.drop import drop_metrics
from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
from lighteval.metrics.metrics_corpus import (
@@ -69,6 +73,7 @@
    SampleLevelMetric,
    SampleLevelMetricGrouping,
)
from lighteval.utils.language import Language
from lighteval.utils.utils import as_list


@@ -549,6 +554,12 @@ class Metrics(Enum):
        corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute,
        higher_is_better=False,
    )
    gpqa_instruct_metric = multilingual_extractive_match_metric(
        language=Language.ENGLISH,
        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        precision=6,
    )
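
The new gpqa_instruct_metric scores a sample by extracting the answer letter from the model's free-form generation (the prompt added below asks for a final "Answer: $LETTER" line) and comparing it against the gold letter. As a rough intuition only, not the actual lighteval extraction logic (which handles many answer formats and languages), the idea reduces to something like:

import re

def naive_letter_match(prediction: str, gold_letter: str) -> float:
    # Toy approximation: take the last "Answer: X" (X in A-D) from the
    # generation and score 1.0 if it matches the gold letter, else 0.0.
    found = re.findall(r"Answer:\s*([ABCD])", prediction)
    return float(bool(found) and found[-1] == gold_letter)

print(naive_letter_match("Let's think step by step...\nAnswer: C", "C"))  # 1.0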

    def __str__(self):
        return self.name.replace("_at_", "@")
19 changes: 19 additions & 0 deletions src/lighteval/tasks/default_prompts.py
@@ -729,6 +729,25 @@ def gpqa(line, task_name: str = None):
    )


def gpqa_instruct(line, task_name: str = None):
    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
    gold_index = random.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])

    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
    query = f"{instruction}\n\n{line['Question']}\n\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])

    return Doc(
        task_name=task_name,
        query=query,
        choices=LETTER_INDICES[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
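
To see what this prompt function produces, here is an illustrative call with a made-up record; the column names match those used above, while the record contents and the task_name label are invented for the example:

from lighteval.tasks.default_prompts import gpqa_instruct  # available once this commit is installed

fake_line = {
    "Question": "Which particle mediates the electromagnetic force?",
    "Correct Answer": "The photon",
    "Incorrect Answer 1": "The gluon",
    "Incorrect Answer 2": "The W boson",
    "Incorrect Answer 3": "The graviton",
}
doc = gpqa_instruct(fake_line, task_name="gpqa:diamond")
print(doc.query)       # instruction, question, then the four options labelled A. to D.
print(doc.choices)     # ['A', 'B', 'C', 'D']
print(doc.gold_index)  # position where "The photon" was inserted (random per call)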


def gsm8k(line, task_name: str = None):
    # Has special analysis in metric for number decomposition
    return Doc(
48 changes: 48 additions & 0 deletions src/lighteval/tasks/default_tasks.py
@@ -7720,6 +7720,54 @@
    trust_dataset=True,
    version=0,
)
gpqa_diamond_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:diamond",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_diamond",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=32_000,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
gpqa_extended_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:extended",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_extended",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=32_000,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
gpqa_main_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:main",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_main",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=32_000,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
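
With these entries registered, the instruct-style GPQA splits become selectable like any other lighteval task through the suite and name fields. The specifier format below is assumed from lighteval's usual "suite|task|num few-shot|truncate few-shot" pattern and may differ between versions:

# Assumed specifier format: "<suite>|<task name>|<num few-shot examples>|<truncate few-shot flag>".
# A zero-shot run of the diamond split defined above would then be requested as:
task_string = "lighteval|gpqa:diamond|0|0"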
gre_reading_comprehension_bigbench = LightevalTaskConfig(
    name="gre_reading_comprehension",
    suite=["bigbench", "bigbench_json"],
89 changes: 0 additions & 89 deletions src/lighteval/tasks/extended/gpqa/main.py

This file was deleted.
