From 1ae2fa287397055a71dd1d55816490a5cb929577 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Grandury?= <57645283+mariagrandury@users.noreply.github.com>
Date: Wed, 22 Jan 2025 11:01:27 +0100
Subject: [PATCH] Translate task template to Catalan and Galician and fix
 typos (#506)

---
 src/lighteval/tasks/default_prompts.py        | 10 ++--
 src/lighteval/tasks/lighteval_task.py         |  4 +-
 src/lighteval/tasks/prompt_manager.py         |  2 +-
 src/lighteval/tasks/registry.py               |  2 +-
 src/lighteval/tasks/templates/continuation.py |  2 +-
 src/lighteval/tasks/templates/copa.py         |  6 +--
 src/lighteval/tasks/templates/hellaswag.py    |  4 +-
 src/lighteval/tasks/templates/nli.py          | 12 ++---
 .../templates/utils/translation_literals.py   | 48 ++++++++++++++++++-
 9 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index c5395281c..66c3d53b4 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -120,7 +120,7 @@ def asdiv(line, task_name: str = None):
 
 def babi_qa(line, task_name: str = None):  # HELM
     def process_path(path: str) -> str:
-        """Turn a path string (task 19) from the original format 's,w' to a verbal model-friendly format 'south west'"""
+        """Turn a path string (task 19) from the original format 's,w' into a verbal model-friendly format 'south west'"""
         steps = path.split(",")
         directions = {"s": "south", "n": "north", "e": "east", "w": "west"}
         path = " ".join([directions[step] for step in steps])
@@ -281,7 +281,7 @@ def bbh_logical_deduction_three_objects(line, task_name: str = None):
 def bbh_movie_recommendation(line, task_name: str = None):
     if line["target"] == "Monsters, Inc":  # this line is not correctly formatted
         logger.warning(
-            "One sample removed from task bbh:movie_recommentation because its line is incorrectly formatted."
+            "One sample removed from task bbh:movie_recommendation because its line is incorrectly formatted."
         )
         return []
     instruction = "Recommend movies similar to the given list of movies.\n\n"
@@ -500,7 +500,7 @@ def civil_comments(line, task_name: str = None):
 def cnn_dm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"###\nArticle:{line['article']}\n\nSummarize the above article in 3 sentence.\n",
+        query=f"###\nArticle:{line['article']}\n\nSummarize the above article in 3 sentences.\n",
         choices=[str(line["summary"])],
         gold_index=0,
         specific={"text": line["article"]},
@@ -730,7 +730,7 @@ def gpqa(line, task_name: str = None):
 
 
 def gsm8k(line, task_name: str = None):
-    # Has special analysis in metric for number decomposiition
+    # Has special analysis in metric for number decomposition
     return Doc(
         task_name=task_name,
         query=f"Question: {line['question']}\nAnswer:",
@@ -2076,7 +2076,7 @@ def rte(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=f"{line['sentence1']}\nQuestion: {line['sentence2']} True or False?\nAnswer:",
-        choices=[" True", " False"],  # 0 = entailement, 1 = not entailment
+        choices=[" True", " False"],  # 0 = entailment, 1 = not entailment
         gold_index=int(line["label"]),
         # "metric": "choices_loglikelihood",
     )
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 09886e4db..c187f653f 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -621,7 +621,7 @@ def create_requests_from_tasks(  # noqa: C901
         n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs)
         evaluation_tracker.task_config_logger.log_num_docs(task_name, len(task_docs), n_samples)
 
-        # logs out the diferent versions of the tasks for every few shot
+        # logs out the different versions of the tasks for every few-shot setting
         for num_fewshot, _ in fewshot_dict[task_name]:
             cur_task_name = f"{task_name}|{num_fewshot}"
             evaluation_tracker.versions_logger.log(cur_task_name, task.version)
@@ -633,7 +633,7 @@ def create_requests_from_tasks(  # noqa: C901
             prompt_manager = PromptManager(lm=lm, task=task)
             seeds = prompt_manager.few_shot_sampler.get_fewshot_seeds(num_fewshot_seeds)
 
-            # We can do several round of fewshots sampling to get some variance informations
+            # We can do several rounds of few-shot sampling to get some variance information
             for seed in seeds:
                 for doc_id in range(n_samples):
                     doc_id_seed = f"{doc_id}_{seed}"  # if we do several rounds of few shot sampling we have several seeds
diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index af55c3184..6d066c921 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -132,7 +132,7 @@ def _multi_turn_contexts(self, doc: Doc, use_chat_template: bool, system_prompt:
         Multi turn tasks need use chat templating.
 
         Args:
-            doc (Doc): Formated document.
+            doc (Doc): Formatted document.
             use_chat_template (bool): wether or not to use chat template. Will fail if false.
             system_prompt (Optional[str]): The system prompt to use
             tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 834e81706..174a98d33 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -166,7 +166,7 @@ def _task_superset_dict(self):
             "lighteval|mmlu" -> ["lighteval|mmlu:abstract_algebra", "lighteval|mmlu:college_biology", ...]
} """ - # Note: sorted before groupby is imporant as the python implementation of groupby does not + # Note: sorted before groupby is important as the python implementation of groupby does not # behave like sql groupby. For more info see the docs of itertools.groupby superset_dict = {k: list(v) for k, v in groupby(sorted(self.task_registry.keys()), lambda x: x.split(":")[0])} # Only consider supersets with more than one task diff --git a/src/lighteval/tasks/templates/continuation.py b/src/lighteval/tasks/templates/continuation.py index 6435fc8f2..c9cd5d1bc 100644 --- a/src/lighteval/tasks/templates/continuation.py +++ b/src/lighteval/tasks/templates/continuation.py @@ -112,7 +112,7 @@ def get_continuation_prompt_function( C. Continuation 3 Answer: A/B/C - This template is very similar to the `Multiple Choice` template, except that it only takes context/continuations as input and don't use the anchor labels (Question/Answer) + This template is very similar to the `Multiple Choice` template, except that it only takes context/continuations as input and doesn't use the anchor labels (Question/Answer) Args: language (Language): The language of the Continuation task. diff --git a/src/lighteval/tasks/templates/copa.py b/src/lighteval/tasks/templates/copa.py index 2129332f8..a4d82c4de 100644 --- a/src/lighteval/tasks/templates/copa.py +++ b/src/lighteval/tasks/templates/copa.py @@ -86,17 +86,17 @@ def get_copa_prompt_function( Format: *CF* - Context Premise thefore/cause | (Continuation 1, Continuation 2, Continuation 3) + Context Premise therefore/cause | (Continuation 1, Continuation 2, Continuation 3) *Hybrid* - Context Premise thefore/cause + Context Premise therefore/cause A. Continuation 1 B. Continuation 2 C. Continuation 3 Answer: | Continuation 1/Continuation 2/Continuation 3 *MCF* - Context Premise thefore/cause + Context Premise therefore/cause A. Continuation 1 B. Continuation 2 C. Continuation 3 diff --git a/src/lighteval/tasks/templates/hellaswag.py b/src/lighteval/tasks/templates/hellaswag.py index 43a3061b6..f5c4ba3db 100644 --- a/src/lighteval/tasks/templates/hellaswag.py +++ b/src/lighteval/tasks/templates/hellaswag.py @@ -70,7 +70,7 @@ def get_hellaswag_prompt_function( Create a templated prompt function for a Hellaswag task. Format: - Context Premise thefore/cause | (Continuation 1, Continuation 2, Continuation 3) + Context Premise therefore/cause | (Continuation 1, Continuation 2, Continuation 3) Args: language (Language): The language of the Hellaswag task. 
@@ -126,7 +126,7 @@ def hellaswag_prompt(
     if ctx_b:
         ctx_a = join_ctxs(ctx_a, ctx_b)
 
-    # Removoal of the [header] can happen and we need the first letter to be capital afterwards
+    # Removal of the [header] can happen and we need the first letter to be capitalized afterwards
     full_context = HELLASWAG_QUERY.format(activity_label=activity_label, ctx=ctx_a)
     choices = [
         hellaswag_preprocess(
diff --git a/src/lighteval/tasks/templates/nli.py b/src/lighteval/tasks/templates/nli.py
index 842460306..e8809e17b 100644
--- a/src/lighteval/tasks/templates/nli.py
+++ b/src/lighteval/tasks/templates/nli.py
@@ -228,7 +228,7 @@ def prompt_fn(line: dict, task_name: str):
         if input_data is None:
             return None
 
-        # Template based on dicussion here: https://github.com/EleutherAI/lm-evaluation-harness/issues/450
+        # Template based on discussion here: https://github.com/EleutherAI/lm-evaluation-harness/issues/450
         labels = [capitalize(get_relation_label(label, translation_literals)) for label in relations]
 
         premise, hypothesis, gold_idx = input_data["premise"], input_data["hypothesis"], input_data["gold_idx"]
@@ -236,15 +236,15 @@ def prompt_fn(line: dict, task_name: str):
         hypothesis = input_data["hypothesis"]
         if isinstance(formulation, HybridFormulation):
             # If we have the neither option move it to the end to be consistent with standard NLI evaluation
-            rearanged_labales = labels
+            rearranged_labels = labels
             if "neutral" in relations:
                 neutral_idx = relations.index("neutral")
-                rearanged_labales = labels[:neutral_idx] + labels[neutral_idx + 1 :] + [labels[neutral_idx]]
+                rearranged_labels = labels[:neutral_idx] + labels[neutral_idx + 1 :] + [labels[neutral_idx]]
 
-            choices_str = f"{translation_literals.comma}{translation_literals.word_space}".join(rearanged_labales[:-1])
-            hypothesis = f"{hypothesis.rstrip(PUNCT)}{translation_literals.sentence_space}{choices_str}{translation_literals.word_space}{translation_literals.or_word}{translation_literals.word_space}{rearanged_labales[-1]}{translation_literals.question_mark}"
+            choices_str = f"{translation_literals.comma}{translation_literals.word_space}".join(rearranged_labels[:-1])
+            hypothesis = f"{hypothesis.rstrip(PUNCT)}{translation_literals.sentence_space}{choices_str}{translation_literals.word_space}{translation_literals.or_word}{translation_literals.word_space}{rearranged_labels[-1]}{translation_literals.question_mark}"
 
-        # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Also in CF fomulation. However as of right now lighteval doesn't allow to
+        # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Also in CF formulation. However, as of right now lighteval doesn't allow us to
         # use multi-context.
         row = {
             "instruction": input_data.get("instruction", ""),
diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py
index 51daf7198..756285e62 100644
--- a/src/lighteval/tasks/templates/utils/translation_literals.py
+++ b/src/lighteval/tasks/templates/utils/translation_literals.py
@@ -178,7 +178,29 @@ def __getattribute__(self, name: str) -> str:
     Language.BRETON: TranslationLiterals(language=Language.BRETON),
     Language.BULGARIAN: TranslationLiterals(language=Language.BULGARIAN),
     Language.BURMESE: TranslationLiterals(language=Language.BURMESE),
-    Language.CATALAN: TranslationLiterals(language=Language.CATALAN),
+    Language.CATALAN: TranslationLiterals(
+        language=Language.CATALAN,
+        question_word="pregunta",
+        answer="resposta",
+        confirmation_word="cert",
+        yes="sí",
+        no="no",
+        also="també",
+        cause_word="perquè",
+        effect_word="per tant",
+        or_word="o",
+        true="veritable",
+        false="fals",
+        neither="cap",
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        semicolon=";",
+    ),
     Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO),
     Language.CHINESE: TranslationLiterals(
         language=Language.CHINESE,
@@ -348,7 +370,29 @@ def __getattribute__(self, name: str) -> str:
         sentence_space=" ",
         colon=":",
     ),
-    Language.GALICIAN: TranslationLiterals(language=Language.GALICIAN),
+    Language.GALICIAN: TranslationLiterals(
+        language=Language.GALICIAN,
+        question_word="pregunta",
+        answer="resposta",
+        confirmation_word="certo",
+        yes="si",
+        no="non",
+        also="tamén",
+        cause_word="porque",
+        effect_word="polo tanto",
+        or_word="ou",
+        true="verdadeiro",
+        false="falso",
+        neither="ningún",
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        semicolon=";",
+    ),
     Language.GEORGIAN: TranslationLiterals(language=Language.GEORGIAN),
     Language.GERMAN: TranslationLiterals(
         language=Language.GERMAN,