moving custom tasks to code
thomwolf committed Feb 7, 2024
1 parent 059e100 commit cb163be
Showing 10 changed files with 71 additions and 197 deletions.
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -37,4 +37,5 @@ repos:
rev: 'v0.1.6'
hooks:
- id: ruff
args: ['--fix']
- id: ruff-format
10 changes: 5 additions & 5 deletions README.md
@@ -8,7 +8,7 @@ LightEval is an evaluation suite which gathers a selection of features from wide

It is still an early, internal version - it should be nice to use but don't expect 100% stability!

In case of problems or questions, feel free to open an issue!

## How to install and use
### Requirements
@@ -50,11 +50,11 @@ Lastly, create a **line summary** of your evaluation, in `metadata_table.json`.
- `suite` (list), the suite(s) to which your evaluation should belong. This field allows us to compare different task implementations and is used as a task selection to differentiate the versions to launch. At the moment, you'll find the keywords ["helm", "bigbench", "original", "lighteval"]; you can also add new ones (for testing, we recommend using "custom").
- `prompt_function` (str), the name of the prompt function you defined in the step above
- `hf_repo` (str), the path to your evaluation dataset on the hub
- `hf_subset` (str), the specific subset you want to use for your evaluation (note: when the dataset has no subset, fill this field with `"default"`, not with `None` or `""`)
- `hf_avail_splits` (list), all the splits available for your dataset (train, valid or validation, test, other...)
- `evaluation_splits` (list), the splits you want to use for evaluation
- `few_shots_split` (str, can be `null`), the specific split from which you want to select samples for your few-shot examples. It should be different from the sets included in `evaluation_splits`
- `few_shots_select` (str, can be `null`), the method that you will use to select items for your few-shot examples. Can be `null`, or one of:
- `balanced` selects examples from the `few_shots_split` with balanced labels, to avoid skewing the few shot examples (hence the model generations) towards one specific label
- `random` selects examples at random from the `few_shots_split`
- `random_sampling` selects new examples at random from the `few_shots_split` for every new item, but if a sampled item is equal to the current one, it is removed from the available samples
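
To make the field descriptions above concrete, a single line-summary entry could look like the sketch below. It is written as an annotated Python dict purely so it can carry comments; the actual `metadata_table.json` file is plain JSON, and every value (task name, dataset path, splits) is an invented placeholder rather than something taken from the repository. The `name` field is assumed by analogy with the task config introduced later in this commit.

```python
# Hypothetical metadata_table.json entry, shown as a Python dict so it can carry comments.
example_line_summary = {
    "name": "mytask",                                    # illustrative task name (assumed field)
    "suite": ["custom"],                                 # "custom" is recommended for tests
    "prompt_function": "mytask_prompt",                  # the prompt function defined in the previous step
    "hf_repo": "myorg/mytask",                           # path to the evaluation dataset on the hub
    "hf_subset": "default",                              # use "default" when the dataset has no subset
    "hf_avail_splits": ["train", "validation", "test"],  # all splits available in the dataset
    "evaluation_splits": ["validation"],                 # splits actually used for evaluation
    "few_shots_split": "train",                          # must not overlap with evaluation_splits
    "few_shots_select": "balanced",                      # or "random", "random_sampling", or null
}
```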
@@ -102,7 +102,7 @@ These metrics need the model to generate an output. They are therefore slower.
- `exact_match_indicator`: Exact match with some preceding context (before an indicator) removed
- `f1_score_quasi` (HELM): Average F1 score in terms of word overlap between the model output and gold, with both being normalized first
- `f1_score`: Average F1 score in terms of word overlap between the model output and gold without normalisation
- `f1_score_macro`: Corpus level macro F1 score
- `f1_score_micro`: Corpus level micro F1 score
- Summarization:
- `rouge` (Harness): Average ROUGE score [(Lin, 2004)](https://aclanthology.org/W04-1013/)
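
The F1-based entries above all reduce to token-overlap F1 between the model output and the gold answer. As a reminder of the underlying computation, here is a generic sketch; normalization and multi-gold aggregation are omitted, and this is not lighteval's exact implementation.

```python
from collections import Counter

def word_overlap_f1(prediction: str, gold: str) -> float:
    """Token-level F1 between a prediction and a single gold answer."""
    pred_tokens, gold_tokens = prediction.split(), gold.split()
    # Count how many tokens (with multiplicity) appear in both strings.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    return 2 * precision * recall / (precision + recall)
```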
@@ -141,7 +141,7 @@ These metrics need both the generation and its logprob. They are not working at
- `prediction_perplexity` (HELM): Measure of the logprob of a given input.
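
For reference, perplexity-style metrics such as the one above are the exponential of the negative mean token log-probability; the snippet below is a generic definition, not lighteval's implementation.

```python
import math

def perplexity(token_logprobs: list[float]) -> float:
    # exp of the negative mean log-probability over the scored tokens
    return math.exp(-sum(token_logprobs) / len(token_logprobs))
```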

## Adding a new metric
If you want to add a new metric, first check if you can use one of the parametrized functions in `src.lighteval.metrics.metrics_corpus` or `metrics_sample`. If not, add it to either of these files depending on the level at which it is applied. Then, follow the example in `src.lighteval.metrics.metrics` to register your metric.
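
For a sample-level metric, the function you add to `metrics_sample` is plain Python that scores one prediction against its gold targets. The sketch below only illustrates that shape; the name, signature, and normalization are assumptions for this example, and the registration step should follow the existing entries in `src.lighteval.metrics.metrics`, which are not reproduced here.

```python
# Illustrative sample-level metric (the exact interface expected by
# src.lighteval.metrics.metrics_sample is not shown in this commit).
def prefix_exact_match(prediction: str, golds: list[str]) -> float:
    """Return 1.0 if the normalized prediction starts with any normalized gold answer, else 0.0."""
    def normalize(text: str) -> str:
        # Lowercase and collapse whitespace before comparing.
        return " ".join(text.lower().strip().split())

    pred = normalize(prediction)
    return float(any(pred.startswith(normalize(gold)) for gold in golds))
```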

## Examples of scripts to launch lighteval on the cluster
### Evaluate a whole suite on one node, 8 GPUs
Empty file.
4 changes: 1 addition & 3 deletions src/lighteval/logging/evaluation_tracker.py
@@ -556,7 +556,7 @@ def push_results_to_tensorboard( # noqa: C901

tb_context.close() # flushes the unfinished write operations
time.sleep(5)
files = os.listdir(output_dir_tb)
files = os.listdir(str(output_dir_tb))
for file in files:
os.rename(os.path.join(output_dir_tb, file), os.path.join(output_dir_tb, f"{global_step:07d}_{file}"))

@@ -566,5 +566,3 @@ def push_results_to_tensorboard( # noqa: C901
f"Pushed to tensorboard at https://huggingface.co/tensorboard/{lighteval_config.logging.hub_repo_tensorboard}/"
f" at {output_dir_tb} and global_step {global_step}"
)
# except Exception as e:
# logger.warning(f"Could not push to tensorboard\n{e}")
Empty file.
Empty file added src/lighteval/tasks/__init__.py
Empty file.
37 changes: 37 additions & 0 deletions src/lighteval/tasks/lighteval_task.py
@@ -1,5 +1,6 @@
import collections
import random
from dataclasses import dataclass
from multiprocessing import Pool
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Tuple
@@ -39,6 +40,42 @@
from lighteval.logging.evaluation_tracker import EvaluationTracker


@dataclass
class CustomEvaluationTaskConfig:
name: str
prompt_function: str
hf_repo: str
hf_subset: str
metric: Tuple[Metrics]
hf_avail_splits: Optional[Tuple[str]] = None
evaluation_splits: Optional[Tuple[str]] = None
few_shots_split: Optional[str] = None
few_shots_select: Optional[str] = None
generation_size: int = -1
stop_sequence: Optional[Tuple[str]] = None
output_regex: Optional[str] = None

frozen: bool = False
suite: Optional[Tuple[str]] = None # we use this to know if we should use a custom lighteval or bigcode task

def __post_init__(self):
if self.suite is None:
self.suite = ["custom"]
if self.hf_avail_splits is None:
self.hf_avail_splits = ["train", "validation", "test"]
if self.evaluation_splits is None:
self.evaluation_splits = ["validation"]
if self.stop_sequence is None:
self.stop_sequence = ["\n"]

# Convert list to tuple for hashing
self.metric = tuple(self.metric)
self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
self.suite = tuple(self.suite) if self.suite else None
self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None


class LightevalTask:
def __init__(self, name: str, cfg: dict, cache_dir: Optional[str] = None, custom_tasks_module=None):
"""
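
Before the real task definitions in the next file, here is a minimal usage sketch of the new dataclass, showing the defaults and the list-to-tuple conversion performed by `__post_init__`; the task name and dataset path are invented for the example.

```python
# Hypothetical config; __post_init__ fills in default splits/suite/stop sequences
# and converts the list fields to tuples so the config is hashable.
cfg = CustomEvaluationTaskConfig(
    name="mytask",
    prompt_function="mytask_prompt",
    hf_repo="myorg/mytask",
    hf_subset="default",
    metric=["loglikelihood_acc"],
)
assert cfg.suite == ("custom",)
assert cfg.evaluation_splits == ("validation",)
assert cfg.stop_sequence == ("\n",)
assert cfg.metric == ("loglikelihood_acc",)
```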
55 changes: 26 additions & 29 deletions tasks_examples/custom_tasks/custom_evaluation_tasks.py
@@ -6,59 +6,56 @@
"""
import re
from dataclasses import asdict
from typing import Dict, List
from typing import Dict, List, Tuple

from lighteval.metrics import MetricCategory, Metrics
from lighteval.tasks.lighteval_task import CustomEvaluationTaskConfig
from lighteval.tasks.requests import Doc
from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES

from .custom_evaluation_utils import *


# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on

_TASKS_STRINGS: List[Tuple[CustomEvaluationTask, str]] = []
_TASKS: List[CustomEvaluationTask] = []
_TASKS_STRINGS: List[Tuple[CustomEvaluationTaskConfig, str]] = []
_TASKS: List[CustomEvaluationTaskConfig] = []

## COMMON_SENSE_REASONING_TASKS ##
COMMON_SENSE_REASONING_TASKS = [
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="hellaswag",
prompt_function="hellaswag_prompt",
hf_repo="hellaswag",
hf_subset="default",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="winogrande",
prompt_function="winogrande",
hf_repo="winogrande",
hf_subset="winogrande_xl",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="piqa",
prompt_function="piqa_harness",
hf_repo="piqa",
hf_subset="plain_text",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="siqa",
prompt_function="siqa_prompt",
hf_repo="lighteval/siqa",
hf_subset="default",
hf_avail_splits=["train", "validation"],
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="openbookqa",
prompt_function="openbookqa",
hf_repo="openbookqa",
hf_subset="main",
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="arc:easy",
prompt_function="arc",
hf_repo="ai2_arc",
@@ -67,7 +64,7 @@
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="arc:challenge",
prompt_function="arc",
hf_repo="ai2_arc",
@@ -76,7 +73,7 @@
generation_size=1,
metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="commonsense_qa",
prompt_function="commonsense_qa_prompt",
hf_repo="commonsense_qa",
@@ -134,7 +131,7 @@ def preprocess(text):
## WORLD_KNOWLEDGE_TASKS ##

WORLD_KNOWLEDGE_TASKS = [
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="trivia_qa",
prompt_function="triviaqa",
hf_repo="trivia_qa",
@@ -143,7 +140,7 @@ def preprocess(text):
generation_size=20,
stop_sequence=["\n", ".", ","],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="natural_questions",
prompt_function="natural_questions_prompt",
hf_repo="lighteval/natural_questions_clean",
@@ -173,14 +170,14 @@ def natural_questions_prompt(line, task_name: str = None):
## Reading comprehension ##

READING_COMP_TASKS = [
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="super_glue:boolq",
prompt_function="boolq_prompt",
hf_repo="super_glue",
hf_subset="boolq",
metric=["target_perplexity"],
),
CustomEvaluationTask(
CustomEvaluationTaskConfig(
name="quac",
prompt_function="quac",
hf_repo="lighteval/quac_helm",
@@ -207,7 +204,7 @@ def boolq_prompt(line, task_name: str = None):


## MATH ##
class CustomMathEvaluationTask(CustomEvaluationTask):
class CustomMathEvaluationTask(CustomEvaluationTaskConfig):
"""Custom class for math tasks with all the defaults set"""

def __init__(
@@ -254,7 +251,7 @@ def __init__(
CustomMathEvaluationTask(name="math:prealgebra", hf_subset="prealgebra"),
CustomMathEvaluationTask(name="math:precalculus", hf_subset="precalculus"),
]
GSM8K = CustomEvaluationTask(
GSM8K = CustomEvaluationTaskConfig(
name="gsm8k",
prompt_function="gsm8k",
hf_repo="gsm8k",
@@ -275,7 +272,7 @@ def __init__(


## MMLU ##
class CustomMMLUEvaluationTask(CustomEvaluationTask):
class CustomMMLUEvaluationTask(CustomEvaluationTaskConfig):
def __init__(
self,
name,
@@ -418,7 +415,7 @@ def mmlu_prompt(line, task_name: str = None):
## BBH ##


class CustomBBHEvaluationTask(CustomEvaluationTask):
class CustomBBHEvaluationTask(CustomEvaluationTaskConfig):
def __init__(
self,
name,
@@ -509,7 +506,7 @@ def bbh_prompt(line, task_name: str = None):


## AGI eval ##
class CustomAGIEvalEvaluationTask(CustomEvaluationTask):
class CustomAGIEvalEvaluationTask(CustomEvaluationTaskConfig):
def __init__(
self,
name,
@@ -620,17 +617,17 @@ def agi_eval_prompt_no_letters(line, task_name: str = None):


## HUMAN EVAL ##
# human_eval = CustomEvaluationTask(
# human_eval = CustomEvaluationTaskConfig(
# name="human_eval",
# prompt_function="human_eval",
# hf_repo="lighteval/human_eval",
# metric=["human_eval_pass_at_1"],
# ),


def has_generative_metrics(task: CustomEvaluationTask) -> bool:
def has_generative_metrics(task: CustomEvaluationTaskConfig) -> bool:
for metric in task.metric:
if metric in NEEDS_GENERATION_ONLY:
if metric.category == MetricCategory.GENERATIVE:
return True
return False
