Commit: Refactor

lewtun committed Feb 4, 2025
1 parent 88f939e commit fa00c5f
Showing 4 changed files with 78 additions and 89 deletions.
11 changes: 11 additions & 0 deletions src/lighteval/metrics/metrics.py
@@ -24,6 +24,10 @@
import numpy as np
from aenum import Enum

from lighteval.metrics.dynamic_metrics import (
    IndicesExtractionConfig,
    multilingual_extractive_match_metric,
)
from lighteval.metrics.harness_compatibility.drop import drop_metrics
from lighteval.metrics.harness_compatibility.truthful_qa import truthfulqa_mc_metrics
from lighteval.metrics.metrics_corpus import (
@@ -69,6 +73,7 @@
    SampleLevelMetric,
    SampleLevelMetricGrouping,
)
from lighteval.utils.language import Language
from lighteval.utils.utils import as_list


@@ -549,6 +554,12 @@ class Metrics(Enum):
        corpus_level_fn=CorpusLevelPerplexityMetric("weighted_perplexity").compute,
        higher_is_better=False,
    )
    gpqa_instruct_metric = multilingual_extractive_match_metric(
        language=Language.ENGLISH,
        gold_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        pred_extraction_target=[IndicesExtractionConfig(prefix_for_extraction="NativeLetters")],
        precision=6,
    )
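
The new gpqa_instruct_metric scores a sample by extracting the answer letter from the model's free-form generation (the prompt added below asks for a final "Answer: $LETTER" line) and comparing it against the gold letter. As a rough intuition only, not the actual lighteval extraction logic (which handles many answer formats and languages), the idea reduces to something like:

import re

def naive_letter_match(prediction: str, gold_letter: str) -> float:
    # Toy approximation: take the last "Answer: X" (X in A-D) from the
    # generation and score 1.0 if it matches the gold letter, else 0.0.
    found = re.findall(r"Answer:\s*([ABCD])", prediction)
    return float(bool(found) and found[-1] == gold_letter)

print(naive_letter_match("Let's think step by step...\nAnswer: C", "C"))  # 1.0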

    def __str__(self):
        return self.name.replace("_at_", "@")
19 changes: 19 additions & 0 deletions src/lighteval/tasks/default_prompts.py
@@ -729,6 +729,25 @@ def gpqa(line, task_name: str = None):
    )


def gpqa_instruct(line, task_name: str = None):
    """Adapted from simple-eval: https://github.com/openai/simple-evals/blob/83ed7640a7d9cd26849bcb3340125002ef14abbe/common.py#L14"""
    gold_index = random.randint(0, 3)
    choices = [line["Incorrect Answer 1"], line["Incorrect Answer 2"], line["Incorrect Answer 3"]]
    choices.insert(gold_index, line["Correct Answer"])

    instruction = "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering."
    query = f"{instruction}\n\n{line['Question']}\n\n"
    query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, choices)])

    return Doc(
        task_name=task_name,
        query=query,
        choices=LETTER_INDICES[: len(choices)],
        gold_index=gold_index,
        instruction=instruction,
    )
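
To see what this prompt function produces, here is an illustrative call with a made-up record; the column names match those used above, while the record contents and the task_name label are invented for the example:

from lighteval.tasks.default_prompts import gpqa_instruct  # available once this commit is installed

fake_line = {
    "Question": "Which particle mediates the electromagnetic force?",
    "Correct Answer": "The photon",
    "Incorrect Answer 1": "The gluon",
    "Incorrect Answer 2": "The W boson",
    "Incorrect Answer 3": "The graviton",
}
doc = gpqa_instruct(fake_line, task_name="gpqa:diamond")
print(doc.query)       # instruction, question, then the four options labelled A. to D.
print(doc.choices)     # ['A', 'B', 'C', 'D']
print(doc.gold_index)  # position where "The photon" was inserted (random per call)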


def gsm8k(line, task_name: str = None):
    # Has special analysis in metric for number decomposition
    return Doc(
48 changes: 48 additions & 0 deletions src/lighteval/tasks/default_tasks.py
@@ -7720,6 +7720,54 @@
    trust_dataset=True,
    version=0,
)
gpqa_diamond_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:diamond",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_diamond",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=32_000,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
gpqa_extended_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:extended",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_extended",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=32_000,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
gpqa_main_instruct_lighteval = LightevalTaskConfig(
    name="gpqa:main",
    suite=["lighteval"],
    prompt_function=prompt.gpqa_instruct,
    hf_repo="Idavidrein/gpqa",
    hf_subset="gpqa_main",
    hf_avail_splits=["train"],
    evaluation_splits=["train"],
    few_shots_split=None,
    few_shots_select="random_sampling",
    generation_size=32_000,  # needed for reasoning models like R1
    metric=[Metrics.gpqa_instruct_metric],
    stop_sequence=[],  # no stop sequence, will use eos token
    trust_dataset=True,
    version=0,
)
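
With these entries registered, the instruct-style GPQA splits become selectable like any other lighteval task through the suite and name fields. The specifier format below is assumed from lighteval's usual "suite|task|num few-shot|truncate few-shot" pattern and may differ between versions:

# Assumed specifier format: "<suite>|<task name>|<num few-shot examples>|<truncate few-shot flag>".
# A zero-shot run of the diamond split defined above would then be requested as:
task_string = "lighteval|gpqa:diamond|0|0"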
gre_reading_comprehension_bigbench = LightevalTaskConfig(
    name="gre_reading_comprehension",
    suite=["bigbench", "bigbench_json"],
89 changes: 0 additions & 89 deletions src/lighteval/tasks/extended/gpqa/main.py

This file was deleted.
