From fa00c5f37dc8f3de7eb7813105d724264f21e373 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Tue, 4 Feb 2025 16:13:57 +0000
Subject: [PATCH] Refactor

---
 src/lighteval/metrics/metrics.py          | 11 +++
 src/lighteval/tasks/default_prompts.py    | 19 +++
 src/lighteval/tasks/default_tasks.py      | 48 ++++++++++++
 src/lighteval/tasks/extended/gpqa/main.py | 89 -----------------------
 4 files changed, 78 insertions(+), 89 deletions(-)
 delete mode 100644 src/lighteval/tasks/extended/gpqa/main.py

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index ff4b6b059..171219197 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -24,6 +24,10 @@
 import numpy as np
 from aenum import Enum
 
+from lighteval.metrics.dynamic_metrics import (
+    IndicesExtractionConfig,
+    multilingual_extractive_match_metric,
+)
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
 from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
 from lighteval.metrics.metrics_corpus import (
@@ -69,6 +73,7 @@
     SampleLevelMetric,
     SampleLevelMetricGrouping,
 )
+from lighteval.utils.language import Language
 from lighteval.utils.utils import as_list
 
 
@@ -549,6 +554,12 @@ class Metrics(Enum):
         corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute,
         higher_is_better=False,
     )
+    gpqa_instruct_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+        precision=6,
+    )
 
     def __str__(self):
         return self.name.replace("_at_", "@")
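
Review note: the new gpqa_instruct_metric entry reuses the extractive-match machinery with a letter-index ("NativeLetters") extraction target on both the gold and prediction sides, i.e. a sample is scored by recovering a single answer letter from a free-form completion and comparing it to the gold letter. As a standalone illustration of that extraction step (this is an illustration of the idea, not lighteval's implementation), consider:

    import re

    # The instruct prompt asks the model to end with a line of the form
    # 'Answer: $LETTER'. Extraction boils down to recovering that letter.
    ANSWER_RE = re.compile(r"Answer\s*:\s*\$?([A-D])", re.IGNORECASE)

    def extract_answer_letter(completion: str) -> str | None:
        """Return the last 'Answer: X' letter in the completion, if any."""
        matches = ANSWER_RE.findall(completion)
        return matches[-1].upper() if matches else None

    assert extract_answer_letter("Reasoning...\nAnswer: C") == "C"
    assert extract_answer_letter("no final answer line") is None
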
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 66c3d53b4..11037873a 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -729,6 +729,25 @@ def gpqa(line, task_name: str = None):
     )
 
 
+def gpqa_instruct(line, task_name: str = None):
+    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
+    gold_index = random.randint(0, 3)
+    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
+    choices.insert(gold_index, line["Correct Answer"])
+
+    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
+    query = f"{instruction}\n\n{line['Question']}\n\n"
+    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=LETTER_INDICES[: len(choices)],
+        gold_index=gold_index,
+        instruction=instruction,
+    )
+
+
 def gsm8k(line, task_name: str = None):
     # Has special analysis in metric for number decomposition
     return Doc(
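
For reference, here is roughly what gpqa_instruct renders for one row. The row values below are invented placeholders; only the column names ("Question", "Correct Answer", "Incorrect Answer 1..3") come from the Idavidrein/gpqa dataset used above:

    from lighteval.tasks.default_prompts import gpqa_instruct

    # Hypothetical row for illustration; real rows come from the HF dataset.
    line = {
        "Question": "Which particle mediates the electromagnetic force?",
        "Correct Answer": "The photon",
        "Incorrect Answer 1": "The gluon",
        "Incorrect Answer 2": "The W boson",
        "Incorrect Answer 3": "The graviton",
    }
    doc = gpqa_instruct(line, task_name="gpqa:diamond")
    print(doc.query)
    # Answer the following multiple choice question. [...] Think step by step before answering.
    #
    # Which particle mediates the electromagnetic force?
    #
    # A. ...   (choice order is shuffled: the gold answer lands at a random index,
    # B. ...    which is also recorded as doc.gold_index)
    # C. ...
    # D. ...
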
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index d6a7ec498..039faa9b9 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -7720,6 +7720,54 @@
     trust_dataset=True,
     version=0,
 )
+gpqa_diamond_instruct_lighteval = LightevalTaskConfig(
+    name="gpqa:diamond",
+    suite=["lighteval"],
+    prompt_function=prompt.gpqa_instruct,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_diamond",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=32_000,  # needed for reasoning models like R1
+    metric=[Metrics.gpqa_instruct_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=0,
+)
+gpqa_extended_instruct_lighteval = LightevalTaskConfig(
+    name="gpqa:extended",
+    suite=["lighteval"],
+    prompt_function=prompt.gpqa_instruct,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_extended",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=32_000,  # needed for reasoning models like R1
+    metric=[Metrics.gpqa_instruct_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=0,
+)
+gpqa_main_instruct_lighteval = LightevalTaskConfig(
+    name="gpqa:main",
+    suite=["lighteval"],
+    prompt_function=prompt.gpqa_instruct,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_main",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=32_000,  # needed for reasoning models like R1
+    metric=[Metrics.gpqa_instruct_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=0,
+)
 gre_reading_comprehension_bigbench = LightevalTaskConfig(
     name="gre_reading_comprehension",
     suite=["bigbench", "bigbench_json"],

diff --git a/src/lighteval/tasks/extended/gpqa/main.py b/src/lighteval/tasks/extended/gpqa/main.py
deleted file mode 100644
index 939abf9fa..000000000
--- a/src/lighteval/tasks/extended/gpqa/main.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# MIT License
-
-# Copyright (c) 2025 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""Usage:
-
-lighteval vllm pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,tensor_parallel_size=1,max_model_length=32768,gpu_memory_utilisation=0.8 "extended|gpqa:diamond|0|0" \
-    --use-chat-template \
-    --custom-tasks src/lighteval/tasks/extended/gpqa/main.py
-"""
-import random
-
-from lighteval.metrics.dynamic_metrics import (
-    IndicesExtractionConfig,
-    multilingual_extractive_match_metric,
-)
-from lighteval.tasks.default_prompts import LETTER_INDICES
-from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig
-from lighteval.utils.language import Language
-
-
-def gpqa_prompt_fn(line, task_name: str = None):
-    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
-    gold_index = random.randint(0, 3)
-    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
-    choices.insert(gold_index, line["Correct Answer"])
-
-    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
-    query = f"{instruction}\n\n{line['Question']}\n\n"
-    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
-
-    return Doc(
-        task_name=task_name,
-        query=query,
-        choices=LETTER_INDICES[: len(choices)],
-        gold_index=gold_index,
-        instruction=instruction,
-    )
-
-
-extraction_targets = [IndicesExtractionConfig(prefix_for_extraction="NativeLetters")]
-
-metric = multilingual_extractive_match_metric(
-    language=Language.ENGLISH,
-    gold_extraction_target=extraction_targets,
-    pred_extraction_target=extraction_targets,
-    precision=6,
-)
-
-subsets = ["extended", "main", "diamond"]
-task_configs = []
-
-for subset in subsets:
-    task = LightevalTaskConfig(
-        name=f"gpqa:{subset}",
-        suite=["extended"],
-        prompt_function=gpqa_prompt_fn,
-        hf_repo="Idavidrein/gpqa",
-        hf_subset="gpqa_diamond",
-        hf_avail_splits=["train"],
-        evaluation_splits=["train"],
-        few_shots_split=None,
-        few_shots_select="random_sampling",
-        generation_size=32_000,
-        metric=[metric],
-        stop_sequence=[],  # no stop sequence, will use eos token
-        trust_dataset=True,
-        version=0,
-    )
-    task_configs.append(task)
-
-TASKS_TABLE = task_configs
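
Worth noting: the deleted loop hard-coded hf_subset="gpqa_diamond" for all three subsets, which the new per-subset configs fix. And since the tasks are promoted from the extended suite into the built-in lighteval suite, the invocation from the deleted module's docstring should simplify: the --custom-tasks flag is no longer needed and the task spec changes from extended|gpqa:diamond to lighteval|gpqa:diamond. An untested sketch, adapted from that docstring:

    lighteval vllm pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,tensor_parallel_size=1,max_model_length=32768,gpu_memory_utilisation=0.8 "lighteval|gpqa:diamond|0|0" \
        --use-chat-template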