From 1ae2fa287397055a71dd1d55816490a5cb929577 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mar=C3=ADa=20Grandury?= <57645283+mariagrandury@users.noreply.github.com>
Date: Wed, 22 Jan 2025 11:01:27 +0100
Subject: [PATCH] Translate task template to Catalan and Galician and fix
 typos (#506)

---
 src/lighteval/tasks/default_prompts.py        | 10 ++--
 src/lighteval/tasks/lighteval_task.py         |  4 +-
 src/lighteval/tasks/prompt_manager.py         |  2 +-
 src/lighteval/tasks/registry.py               |  2 +-
 src/lighteval/tasks/templates/continuation.py |  2 +-
 src/lighteval/tasks/templates/copa.py         |  6 +--
 src/lighteval/tasks/templates/hellaswag.py    |  4 +-
 src/lighteval/tasks/templates/nli.py          | 12 ++---
 .../templates/utils/translation_literals.py   | 48 ++++++++++++++++++-
 9 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index c5395281c..66c3d53b4 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -120,7 +120,7 @@ def asdiv(line, task_name: str = None):
 
 def babi_qa(line, task_name: str = None):  # HELM
     def process_path(path: str) -> str:
-        """Turn a path string (task 19) from the original format 's,w' to a verbal model-friendly format 'south west'"""
+        """Turn a path string (task 19) from the original format 's,w' into a verbal model-friendly format 'south west'"""
         steps = path.split(",")
         directions = {"s": "south", "n": "north", "e": "east", "w": "west"}
         path = " ".join([directions[step] for step in steps])
@@ -281,7 +281,7 @@ def bbh_logical_deduction_three_objects(line, task_name: str = None):
 def bbh_movie_recommendation(line, task_name: str = None):
     if line["target"] == "Monsters, Inc":  # this line is not correctly formatted
         logger.warning(
-            "One sample removed from task bbh:movie_recommentation because its line is incorrectly formatted."
+            "One sample removed from task bbh:movie_recommendation because its line is incorrectly formatted."
         )
         return []
     instruction = "Recommend movies similar to the given list of movies.\n\n"
@@ -500,7 +500,7 @@ def civil_comments(line, task_name: str = None):
 def cnn_dm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
-        query=f"###\nArticle:{line['article']}\n\nSummarize the above article in 3 sentence.\n",
+        query=f"###\nArticle:{line['article']}\n\nSummarize the above article in 3 sentences.\n",
         choices=[str(line["summary"])],
         gold_index=0,
         specific={"text": line["article"]},
@@ -730,7 +730,7 @@ def gpqa(line, task_name: str = None):
 
 
 def gsm8k(line, task_name: str = None):
-    # Has special analysis in metric for number decomposiition
+    # Has special analysis in metric for number decomposition
     return Doc(
         task_name=task_name,
         query=f"Question: {line['question']}\nAnswer:",
@@ -2076,7 +2076,7 @@ def rte(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=f"{line['sentence1']}\nQuestion: {line['sentence2']} True or False?\nAnswer:",
-        choices=[" True", " False"],  # 0 = entailement, 1 = not entailment
+        choices=[" True", " False"],  # 0 = entailment, 1 = not entailment
         gold_index=int(line["label"]),
         # "metric": "choices_loglikelihood",
     )
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 09886e4db..c187f653f 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -621,7 +621,7 @@ def create_requests_from_tasks(  # noqa: C901
         n_samples = min(max_samples, len(task_docs)) if max_samples else len(task_docs)
         evaluation_tracker.task_config_logger.log_num_docs(task_name, len(task_docs), n_samples)
 
-        # logs out the diferent versions of the tasks for every few shot
+        # logs out the different versions of the tasks for every few-shot setting
         for num_fewshot, _ in fewshot_dict[task_name]:
             cur_task_name = f"{task_name}|{num_fewshot}"
             evaluation_tracker.versions_logger.log(cur_task_name, task.version)
@@ -633,7 +633,7 @@ def create_requests_from_tasks(  # noqa: C901
             prompt_manager = PromptManager(lm=lm, task=task)
             seeds = prompt_manager.few_shot_sampler.get_fewshot_seeds(num_fewshot_seeds)
 
-            # We can do several round of fewshots sampling to get some variance informations
+            # We can do several rounds of few-shot sampling to get some variance information
             for seed in seeds:
                 for doc_id in range(n_samples):
                     doc_id_seed = f"{doc_id}_{seed}"  # if we do several rounds of few shot sampling we have several seeds
diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index af55c3184..6d066c921 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -132,7 +132,7 @@ def _multi_turn_contexts(self, doc: Doc, use_chat_template: bool, system_prompt:
         Multi turn tasks need use chat templating.
 
         Args:
-            doc (Doc): Formated document.
+            doc (Doc): Formatted document.
             use_chat_template (bool): wether or not to use chat template. Will fail if false.
             system_prompt (Optional[str]): The system prompt to use
             tokenizer (PreTrainedTokenizer): The tokenizer used for the chat template
diff --git a/src/lighteval/tasks/registry.py b/src/lighteval/tasks/registry.py
index 834e81706..174a98d33 100644
--- a/src/lighteval/tasks/registry.py
+++ b/src/lighteval/tasks/registry.py
@@ -166,7 +166,7 @@ def _task_superset_dict(self):
             "lighteval|mmlu" -> ["lighteval|mmlu:abstract_algebra", "lighteval|mmlu:college_biology", ...]
} """ - # Note: sorted before groupby is imporant as the python implementation of groupby does not + # Note: sorted before groupby is important as the python implementation of groupby does not # behave like sql groupby. For more info see the docs of itertools.groupby superset_dict = {k: list(v) for k, v in groupby(sorted(self.task_registry.keys()), lambda x: x.split(":")[0])} # Only consider supersets with more than one task diff --git a/src/lighteval/tasks/templates/continuation.py b/src/lighteval/tasks/templates/continuation.py index 6435fc8f2..c9cd5d1bc 100644 --- a/src/lighteval/tasks/templates/continuation.py +++ b/src/lighteval/tasks/templates/continuation.py @@ -112,7 +112,7 @@ def get_continuation_prompt_function( C. Continuation 3 Answer: A/B/C - This template is very similar to the `Multiple Choice` template, except that it only takes context/continuations as input and don't use the anchor labels (Question/Answer) + This template is very similar to the `Multiple Choice` template, except that it only takes context/continuations as input and doesn't use the anchor labels (Question/Answer) Args: language (Language): The language of the Continuation task. diff --git a/src/lighteval/tasks/templates/copa.py b/src/lighteval/tasks/templates/copa.py index 2129332f8..a4d82c4de 100644 --- a/src/lighteval/tasks/templates/copa.py +++ b/src/lighteval/tasks/templates/copa.py @@ -86,17 +86,17 @@ def get_copa_prompt_function( Format: *CF* - Context Premise thefore/cause | (Continuation 1, Continuation 2, Continuation 3) + Context Premise therefore/cause | (Continuation 1, Continuation 2, Continuation 3) *Hybrid* - Context Premise thefore/cause + Context Premise therefore/cause A. Continuation 1 B. Continuation 2 C. Continuation 3 Answer: | Continuation 1/Continuation 2/Continuation 3 *MCF* - Context Premise thefore/cause + Context Premise therefore/cause A. Continuation 1 B. Continuation 2 C. Continuation 3 diff --git a/src/lighteval/tasks/templates/hellaswag.py b/src/lighteval/tasks/templates/hellaswag.py index 43a3061b6..f5c4ba3db 100644 --- a/src/lighteval/tasks/templates/hellaswag.py +++ b/src/lighteval/tasks/templates/hellaswag.py @@ -70,7 +70,7 @@ def get_hellaswag_prompt_function( Create a templated prompt function for a Hellaswag task. Format: - Context Premise thefore/cause | (Continuation 1, Continuation 2, Continuation 3) + Context Premise therefore/cause | (Continuation 1, Continuation 2, Continuation 3) Args: language (Language): The language of the Hellaswag task. 
@@ -126,7 +126,7 @@ def hellaswag_prompt(
     if ctx_b:
         ctx_a = join_ctxs(ctx_a, ctx_b)
 
-    # Removoal of the [header] can happen and we need the first letter to be capital afterwards
+    # Removal of the [header] can happen and we need the first letter to be capitalized afterwards
     full_context = HELLASWAG_QUERY.format(activity_label=activity_label, ctx=ctx_a)
     choices = [
         hellaswag_preprocess(
diff --git a/src/lighteval/tasks/templates/nli.py b/src/lighteval/tasks/templates/nli.py
index 842460306..e8809e17b 100644
--- a/src/lighteval/tasks/templates/nli.py
+++ b/src/lighteval/tasks/templates/nli.py
@@ -228,7 +228,7 @@ def prompt_fn(line: dict, task_name: str):
         if input_data is None:
             return None
 
-        # Template based on dicussion here: https://github.com/EleutherAI/lm-evaluation-harness/issues/450
+        # Template based on discussion here: https://github.com/EleutherAI/lm-evaluation-harness/issues/450
         labels = [capitalize(get_relation_label(label, translation_literals)) for label in relations]
 
         premise, hypothesis, gold_idx = input_data["premise"], input_data["hypothesis"], input_data["gold_idx"]
@@ -236,15 +236,15 @@ def prompt_fn(line: dict, task_name: str):
         hypothesis = input_data["hypothesis"]
         if isinstance(formulation, HybridFormulation):
             # If we have the neither option move it to the end to be consistent with standard NLI evaluation
-            rearanged_labales = labels
+            rearranged_labels = labels
             if "neutral" in relations:
                 neutral_idx = relations.index("neutral")
-                rearanged_labales = labels[:neutral_idx] + labels[neutral_idx + 1 :] + [labels[neutral_idx]]
+                rearranged_labels = labels[:neutral_idx] + labels[neutral_idx + 1 :] + [labels[neutral_idx]]
 
-            choices_str = f"{translation_literals.comma}{translation_literals.word_space}".join(rearanged_labales[:-1])
-            hypothesis = f"{hypothesis.rstrip(PUNCT)}{translation_literals.sentence_space}{choices_str}{translation_literals.word_space}{translation_literals.or_word}{translation_literals.word_space}{rearanged_labales[-1]}{translation_literals.question_mark}"
+            choices_str = f"{translation_literals.comma}{translation_literals.word_space}".join(rearranged_labels[:-1])
+            hypothesis = f"{hypothesis.rstrip(PUNCT)}{translation_literals.sentence_space}{choices_str}{translation_literals.word_space}{translation_literals.or_word}{translation_literals.word_space}{rearranged_labels[-1]}{translation_literals.question_mark}"
 
-        # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Also in CF fomulation. However as of right now lighteval doesn't allow to
+        # (hynky1999): Ideally we would not compute logprobs of the Yes/No/Also in CF formulation. However, as of right now lighteval doesn't allow us to
         # use multi-context.
         row = {
             "instruction": input_data.get("instruction", ""),
diff --git a/src/lighteval/tasks/templates/utils/translation_literals.py b/src/lighteval/tasks/templates/utils/translation_literals.py
index 51daf7198..756285e62 100644
--- a/src/lighteval/tasks/templates/utils/translation_literals.py
+++ b/src/lighteval/tasks/templates/utils/translation_literals.py
@@ -178,7 +178,29 @@ def __getattribute__(self, name: str) -> str:
     Language.BRETON: TranslationLiterals(language=Language.BRETON),
     Language.BULGARIAN: TranslationLiterals(language=Language.BULGARIAN),
     Language.BURMESE: TranslationLiterals(language=Language.BURMESE),
-    Language.CATALAN: TranslationLiterals(language=Language.CATALAN),
+    Language.CATALAN: TranslationLiterals(
+        language=Language.CATALAN,
+        question_word="pregunta",
+        answer="resposta",
+        confirmation_word="cert",
+        yes="sí",
+        no="no",
+        also="també",
+        cause_word="perquè",
+        effect_word="per tant",
+        or_word="o",
+        true="veritable",
+        false="fals",
+        neither="cap",
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        semicolon=";",
+    ),
     Language.CEBUANO: TranslationLiterals(language=Language.CEBUANO),
     Language.CHINESE: TranslationLiterals(
         language=Language.CHINESE,
@@ -348,7 +370,29 @@ def __getattribute__(self, name: str) -> str:
         sentence_space=" ",
         colon=":",
     ),
-    Language.GALICIAN: TranslationLiterals(language=Language.GALICIAN),
+    Language.GALICIAN: TranslationLiterals(
+        language=Language.GALICIAN,
+        question_word="pregunta",
+        answer="resposta",
+        confirmation_word="certo",
+        yes="si",
+        no="non",
+        also="tamén",
+        cause_word="porque",
+        effect_word="polo tanto",
+        or_word="ou",
+        true="verdadeiro",
+        false="falso",
+        neither="ningún",
+        full_stop=".",
+        comma=",",
+        question_mark="?",
+        exclamation_mark="!",
+        word_space=" ",
+        sentence_space=" ",
+        colon=":",
+        semicolon=";",
+    ),
     Language.GEORGIAN: TranslationLiterals(language=Language.GEORGIAN),
     Language.GERMAN: TranslationLiterals(
         language=Language.GERMAN,