
Commit d7a1f11

Add custom task (bac-fr) for evaluation of models in French (#518)

* Add custom task (bac-fr) for evaluation of models in French
* Update prompt function for the bac-fr task
* Add metrics for the evaluation of bac-fr
* Fix function name (as_list)
* Fix prompt for multichoice

Co-authored-by: Clémentine Fourrier <[email protected]>

1 parent be7da17 commit d7a1f11

File tree

1 file changed: +36 −1 lines changed


community_tasks/french_evals.py

Lines changed: 36 additions & 1 deletion
@@ -46,6 +46,7 @@
 from lighteval.tasks.extended.ifeval.main import ifeval_metrics
 from lighteval.tasks.lighteval_task import LightevalTaskConfig
 from lighteval.tasks.requests import Doc
+from lighteval.utils.utils import as_list


 # Ifeval-fr prompt function
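For context (not part of the diff): as_list is a small lighteval utility used below to normalize the "choix" column into a list of choices. A minimal sketch of its assumed behavior:

def as_list(item):
    # Assumed behavior, not the library source: pass lists through
    # unchanged and wrap anything else in a single-element list.
    return item if isinstance(item, list) else [item]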
@@ -80,6 +81,22 @@ def prompt_gpqa_fr(line, task_name: str = None):
     )


+# BAC-fr prompt function
+def prompt_bac_fr(line, task_name: str = None):
+    prompt = f"Enoncé: {line['enonce']}\n{line['instruction']}\n"
+    if line["choix"] is not None:  # Multichoice evaluation
+        # prompt += "\n".join([f"{LETTER_INDICES[ix]}.{choix}" for ix, choix in enumerate(line["choix"])])
+        return Doc(
+            task_name=task_name,
+            query=prompt,
+            choices=as_list(line["choix"]),
+            gold_index=line["choix"].index(line["choix correct"]),
+            instruction="",
+        )
+    else:
+        return Doc(task_name=task_name, query=prompt, choices=[line["reponse"]], gold_index=0, instruction="")
+
+
 # IFEVal-fr task


@@ -117,5 +134,23 @@ def prompt_gpqa_fr(line, task_name: str = None):
     version=0,
 )

+# BAC-fr task
+bac_fr_task = LightevalTaskConfig(
+    name="bac-fr",
+    suite=["community"],
+    prompt_function=prompt_bac_fr,
+    hf_repo="fr-gouv-coordination-ia/bac-fr",
+    hf_subset="default",
+    hf_avail_splits=["train"],
+    evaluation_splits=["train"],
+    few_shots_split=None,
+    few_shots_select="random_sampling",
+    generation_size=1,
+    metric=[Metrics.quasi_exact_match_math, Metrics.exact_match],
+    stop_sequence=["\n"],
+    trust_dataset=True,
+    version=0,
+)
+
 # STORE YOUR EVALS
-TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task]
+TASKS_TABLE = [ifeval_fr_task, gpqa_fr_task, bac_fr_task]
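After this change, a hypothetical smoke test (not in the commit) can confirm the task is exported with the configuration above; the exact CLI invocation for a full run varies across lighteval releases, so only a module-level check is sketched here:

from community_tasks.french_evals import TASKS_TABLE

# Look up the newly registered task and verify the fields set in the diff.
bac = next(t for t in TASKS_TABLE if t.name == "bac-fr")
assert bac.suite == ["community"]
assert bac.evaluation_splits == ["train"]
assert bac.generation_size == 1  # short answers, scored with exact-match metrics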
