From fa00c5f37dc8f3de7eb7813105d724264f21e373 Mon Sep 17 00:00:00 2001
From: Lewis Tunstall
Date: Tue, 4 Feb 2025 16:13:57 +0000
Subject: [PATCH] Refactor

---
 src/lighteval/metrics/metrics.py          | 11 +++
 src/lighteval/tasks/default_prompts.py    | 19 +++
 src/lighteval/tasks/default_tasks.py      | 48 ++++++++++++
 src/lighteval/tasks/extended/gpqa/main.py | 89 -----------------------
 4 files changed, 78 insertions(+), 89 deletions(-)
 delete mode 100644 src/lighteval/tasks/extended/gpqa/main.py

diff --git a/src/lighteval/metrics/metrics.py b/src/lighteval/metrics/metrics.py
index ff4b6b059..171219197 100644
--- a/src/lighteval/metrics/metrics.py
+++ b/src/lighteval/metrics/metrics.py
@@ -24,6 +24,10 @@
 import numpy as np
 from aenum import Enum
 
+from lighteval.metrics.dynamic_metrics import (
+    IndicesExtractionConfig,
+    multilingual_extractive_match_metric,
+)
 from lighteval.metrics.harness_compatibility.drop import drop_metrics
 from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
 from lighteval.metrics.metrics_corpus import (
@@ -69,6 +73,7 @@
     SampleLevelMetric,
     SampleLevelMetricGrouping,
 )
+from lighteval.utils.language import Language
 from lighteval.utils.utils import as_list
 
 
@@ -549,6 +554,12 @@ class Metrics(Enum):
         corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute,
         higher_is_better=False,
     )
+    gpqa_instruct_metric = multilingual_extractive_match_metric(
+        language=Language.ENGLISH,
+        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
+        precision=6,
+    )
 
     def __str__(self):
         return self.name.replace("_at_", "@")
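
Review note: the new gpqa_instruct_metric entry reuses the extractive-match machinery with a letter-index ("NativeLetters") extraction target on both the gold and prediction sides, i.e. a sample is scored by recovering a single answer letter from a free-form completion and comparing it to the gold letter. As a standalone illustration of that extraction step (this is an illustration of the idea, not lighteval's implementation), consider:

    import re

    # The instruct prompt asks the model to end with a line of the form
    # 'Answer: $LETTER'. Extraction boils down to recovering that letter.
    ANSWER_RE = re.compile(r"Answer\s*:\s*\$?([A-D])", re.IGNORECASE)

    def extract_answer_letter(completion: str) -> str | None:
        """Return the last 'Answer: X' letter in the completion, if any."""
        matches = ANSWER_RE.findall(completion)
        return matches[-1].upper() if matches else None

    assert extract_answer_letter("Reasoning...\nAnswer: C") == "C"
    assert extract_answer_letter("no final answer line") is None
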
diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 66c3d53b4..11037873a 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -729,6 +729,25 @@ def gpqa(line, task_name: str = None):
     )
 
 
+def gpqa_instruct(line, task_name: str = None):
+    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
+    gold_index = random.randint(0, 3)
+    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
+    choices.insert(gold_index, line["Correct Answer"])
+
+    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
+    query = f"{instruction}\n\n{line['Question']}\n\n"
+    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
+
+    return Doc(
+        task_name=task_name,
+        query=query,
+        choices=LETTER_INDICES[: len(choices)],
+        gold_index=gold_index,
+        instruction=instruction,
+    )
+
+
 def gsm8k(line, task_name: str = None):
     # Has special analysis in metric for number decomposition
     return Doc(
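
For reference, here is roughly what gpqa_instruct renders for one row. The row values below are invented placeholders; only the column names ("Question", "Correct Answer", "Incorrect Answer 1..3") come from the Idavidrein/gpqa dataset used above:

    from lighteval.tasks.default_prompts import gpqa_instruct

    # Hypothetical row for illustration; real rows come from the HF dataset.
    line = {
        "Question": "Which particle mediates the electromagnetic force?",
        "Correct Answer": "The photon",
        "Incorrect Answer 1": "The gluon",
        "Incorrect Answer 2": "The W boson",
        "Incorrect Answer 3": "The graviton",
    }
    doc = gpqa_instruct(line, task_name="gpqa:diamond")
    print(doc.query)
    # Answer the following multiple choice question. [...] Think step by step before answering.
    #
    # Which particle mediates the electromagnetic force?
    #
    # A. ...   (choice order is shuffled: the gold answer lands at a random index,
    # B. ...    which is also recorded as doc.gold_index)
    # C. ...
    # D. ...
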
diff --git a/src/lighteval/tasks/default_tasks.py b/src/lighteval/tasks/default_tasks.py
index d6a7ec498..039faa9b9 100644
--- a/src/lighteval/tasks/default_tasks.py
+++ b/src/lighteval/tasks/default_tasks.py
@@ -7720,6 +7720,54 @@
     trust_dataset=True,
     version=0,
 )
+gpqa_diamond_instruct_lighteval = LightevalTaskConfig(
+    name="gpqa:diamond",
+    suite=["lighteval"],
+    prompt_function=prompt.gpqa_instruct,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_diamond",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=32_000,  # needed for reasoning models like R1
+    metric=[Metrics.gpqa_instruct_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=0,
+)
+gpqa_extended_instruct_lighteval = LightevalTaskConfig(
+    name="gpqa:extended",
+    suite=["lighteval"],
+    prompt_function=prompt.gpqa_instruct,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_extended",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=32_000,  # needed for reasoning models like R1
+    metric=[Metrics.gpqa_instruct_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=0,
+)
+gpqa_main_instruct_lighteval = LightevalTaskConfig(
+    name="gpqa:main",
+    suite=["lighteval"],
+    prompt_function=prompt.gpqa_instruct,
+    hf_repo="Idavidrein/gpqa",
+    hf_subset="gpqa_main",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=32_000,  # needed for reasoning models like R1
+    metric=[Metrics.gpqa_instruct_metric],
+    stop_sequence=[],  # no stop sequence, will use eos token
+    trust_dataset=True,
+    version=0,
+)
 gre_reading_comprehension_bigbench = LightevalTaskConfig(
     name="gre_reading_comprehension",
     suite=["bigbench", "bigbench_json"],

diff --git a/src/lighteval/tasks/extended/gpqa/main.py b/src/lighteval/tasks/extended/gpqa/main.py
deleted file mode 100644
index 939abf9fa..000000000
--- a/src/lighteval/tasks/extended/gpqa/main.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# MIT License
-
-# Copyright (c) 2025 The HuggingFace Team
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""Usage:
-
-lighteval vllm pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,tensor_parallel_size=1,max_model_length=32768,gpu_memory_utilisation=0.8 "extended|gpqa:diamond|0|0" \
-    --use-chat-template \
-    --custom-tasks src/lighteval/tasks/extended/gpqa/main.py
-"""
-import random
-
-from lighteval.metrics.dynamic_metrics import (
-    IndicesExtractionConfig,
-    multilingual_extractive_match_metric,
-)
-from lighteval.tasks.default_prompts import LETTER_INDICES
-from lighteval.tasks.lighteval_task import Doc, LightevalTaskConfig
-from lighteval.utils.language import Language
-
-
-def gpqa_prompt_fn(line, task_name: str = None):
-    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
-    gold_index = random.randint(0, 3)
-    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
-    choices.insert(gold_index, line["Correct Answer"])
-
-    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
-    query = f"{instruction}\n\n{line['Question']}\n\n"
-    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])
-
-    return Doc(
-        task_name=task_name,
-        query=query,
-        choices=LETTER_INDICES[: len(choices)],
-        gold_index=gold_index,
-        instruction=instruction,
-    )
-
-
-extraction_targets = [IndicesExtractionConfig(prefix_for_extraction="NativeLetters")]
-
-metric = multilingual_extractive_match_metric(
-    language=Language.ENGLISH,
-    gold_extraction_target=extraction_targets,
-    pred_extraction_target=extraction_targets,
-    precision=6,
-)
-
-subsets = ["extended", "main", "diamond"]
-task_configs = []
-
-for subset in subsets:
-    task = LightevalTaskConfig(
-        name=f"gpqa:{subset}",
-        suite=["extended"],
-        prompt_function=gpqa_prompt_fn,
-        hf_repo="Idavidrein/gpqa",
-        hf_subset="gpqa_diamond",
-        hf_avail_splits=["train"],
-        evaluation_splits=["train"],
-        few_shots_split=None,
-        few_shots_select="random_sampling",
-        generation_size=32_000,
-        metric=[metric],
-        stop_sequence=[],  # no stop sequence, will use eos token
-        trust_dataset=True,
-        version=0,
-    )
-    task_configs.append(task)
-
-TASKS_TABLE = task_configs
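
Worth noting: the deleted loop hard-coded hf_subset="gpqa_diamond" for all three subsets, which the new per-subset configs fix. And since the tasks are promoted from the extended suite into the built-in lighteval suite, the invocation from the deleted module's docstring should simplify: the --custom-tasks flag is no longer needed and the task spec changes from extended|gpqa:diamond to lighteval|gpqa:diamond. An untested sketch, adapted from that docstring:

    lighteval vllm pretrained=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,dtype=float16,tensor_parallel_size=1,max_model_length=32768,gpu_memory_utilisation=0.8 "lighteval|gpqa:diamond|0|0" \
        --use-chat-template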