sillsdev
diff --git a/silnlp/common/translate_google.py b/silnlp/common/translate_google.py
Lines changed: 15 additions & 4 deletions
diff --git a/silnlp/common/translator.py b/silnlp/common/translator.py
Lines changed: 118 additions & 45 deletions
diff --git a/silnlp/nmt/config.py b/silnlp/nmt/config.py
Lines changed: 3 additions & 1 deletion
diff --git a/silnlp/nmt/experiment.py b/silnlp/nmt/experiment.py
Lines changed: 11 additions & 0 deletions
@@ -1,22 +1,33 @@
 import argparse
+import logging
 from typing import Iterable, Optional
 
 from google.cloud import translate_v2 as translate
 from machine.scripture import VerseRef, book_id_to_number
 
 from ..common.environment import SIL_NLP_ENV
 from .paratext import book_file_name_digits
-from .translator import Translator
+from .translator import TranslationGroup, Translator
 from .utils import get_git_revision_hash, get_mt_exp_dir
 
+LOGGER = logging.getLogger(__package__ + ".translate")
+
 
 class GoogleTranslator(Translator):
     def __init__(self) -> None:
         self._translate_client = translate.Client()
 
     def translate(
-        self, sentences: Iterable[str], src_iso: str, trg_iso: str, vrefs: Optional[Iterable[VerseRef]] = None
-    ) -> Iterable[str]:
+        self,
+        sentences: Iterable[str],
+        src_iso: str,
+        trg_iso: str,
+        produce_multiple_translations: bool = False,
+        vrefs: Optional[Iterable[VerseRef]] = None,
+    ) -> Iterable[TranslationGroup]:
+        if produce_multiple_translations:
+            LOGGER.warning("Google Translator does not support --multiple-translations")
+
         for sentence in sentences:
             if len(sentence) == 0:
                 yield ""
@@ -25,7 +36,7 @@ def translate(
                     sentence, source_language=src_iso, target_language=trg_iso, format_="text"
                 )
                 translation = results["translatedText"]
-                yield translation
+                yield [translation]
 
 
 def main() -> None:
 
@@ -46,22 +46,67 @@ def insert_draft_remark(
     return "\n".join(lines)
 
 
+# A group of multiple translations of a single sentence.
+# Element j is the alternative translation that belongs to draft j.
+TranslationGroup = List[str]
+
+# A list representing a single draft (one translation of each input sentence).
+# Element i is this draft's translation of input sentence i.
+TranslatedDraft = List[str]
+
+
+# A wrapper around List[TranslationGroup] that allows upstream consumers to view a
+# list of translation groups as a collection of discrete drafts
+class DraftGroup:
+    """View a list of per-sentence translation groups as a collection of discrete drafts.
+
+    ``translation_groups[i][j]`` is the j-th alternative translation of input sentence i;
+    draft j consists of the j-th alternative of every sentence, in input order.
+
+    An empty group stands for a sentence that has no translation (``translate_usfm``
+    re-inserts ``[]`` for the empty sentences it skipped). Such a group contributes an
+    empty string to every draft so that drafts stay aligned with the input sentences.
+    """
+
+    def __init__(self, translation_groups: List[TranslationGroup]):
+        self.translation_groups = translation_groups
+        # Take the draft count from the first non-empty group, because the first group
+        # may itself be an empty placeholder. Fall back to a single draft when nothing
+        # was translated so that callers still produce one (empty) output.
+        self.num_drafts: int = next((len(group) for group in translation_groups if len(group) > 0), 1)
+
+    def get_drafts(self) -> List[TranslatedDraft]:
+        """Return one list of sentences per draft, each aligned with the input sentences.
+
+        Raises:
+            IndexError: if a non-empty group holds fewer than ``num_drafts`` translations.
+        """
+        translated_draft_sentences: List[TranslatedDraft] = [[] for _ in range(self.num_drafts)]
+
+        for translation_group in self.translation_groups:
+            if len(translation_group) == 0:
+                # Placeholder group: pad every draft with an empty sentence.
+                translation_group = [""] * self.num_drafts
+            for draft_index, draft in enumerate(translated_draft_sentences):
+                draft.append(translation_group[draft_index])
+
+        return translated_draft_sentences
+
+
 class Translator(ABC):
     @abstractmethod
     def translate(
-        self, sentences: Iterable[str], src_iso: str, trg_iso: str, vrefs: Optional[Iterable[VerseRef]] = None
-    ) -> Iterable[str]:
+        self,
+        sentences: Iterable[str],
+        src_iso: str,
+        trg_iso: str,
+        produce_multiple_translations: bool = False,
+        vrefs: Optional[Iterable[VerseRef]] = None,
+    ) -> Iterable[TranslationGroup]:
+        """Translate sentences from ``src_iso`` to ``trg_iso``.
+
+        Args:
+            sentences: the source sentences to translate.
+            src_iso: language code of the source text.
+            trg_iso: language code of the target text.
+            produce_multiple_translations: request several alternative translations of
+                each sentence. Implementations that cannot do so may ignore the flag
+                (GoogleTranslator only logs a warning).
+            vrefs: optional verse references; presumably parallel to ``sentences`` --
+                confirm against the concrete implementations.
+
+        Returns:
+            An iterable of translation groups, each holding the translation(s) of one
+            sentence.
+        """
         pass
 
-    def translate_text(self, src_file_path: Path, trg_file_path: Path, src_iso: str, trg_iso: str) -> None:
-        write_corpus(trg_file_path, self.translate(load_corpus(src_file_path), src_iso, trg_iso))
+    def translate_text(
+        self,
+        src_file_path: Path,
+        trg_file_path: Path,
+        src_iso: str,
+        trg_iso: str,
+        produce_multiple_translations: bool = False,
+    ) -> None:
+        """Translate a plain-text corpus file and write the resulting draft(s).
+
+        With ``produce_multiple_translations`` each draft goes to its own file, numbered
+        from 1 and inserted before the original suffix ("out.txt" -> "out.1.txt", ...).
+        Otherwise the draft is written to ``trg_file_path`` itself.
+        """
+        source_sentences = load_corpus(src_file_path)
+        translation_groups = list(
+            self.translate(source_sentences, src_iso, trg_iso, produce_multiple_translations)
+        )
+        drafts = DraftGroup(translation_groups).get_drafts()
+
+        for draft_number, draft_sentences in enumerate(drafts, start=1):
+            destination = trg_file_path
+            if produce_multiple_translations:
+                destination = trg_file_path.with_suffix(f".{draft_number}{trg_file_path.suffix}")
+            write_corpus(destination, draft_sentences)
 
     def translate_book(
         self,
         src_project: str,
         book: str,
         output_path: Path,
         trg_iso: str,
+        produce_multiple_translations: bool = False,
         chapters: List[int] = [],
         trg_project: Optional[str] = None,
         include_inline_elements: bool = False,
@@ -78,6 +123,7 @@ def translate_book(
             output_path,
             get_iso(src_project),
             trg_iso,
+            produce_multiple_translations,
             chapters,
             trg_project,
             include_inline_elements,
@@ -87,9 +133,10 @@ def translate_book(
     def translate_usfm(
         self,
         src_file_path: Path,
-        out_path: Path,
+        trg_file_path: Path,
         src_iso: str,
         trg_iso: str,
+        produce_multiple_translations: bool = False,
         chapters: List[int] = [],
         trg_project: Optional[str] = None,
         include_inline_elements: bool = False,
@@ -131,47 +178,63 @@ def translate_usfm(
         empty_sents = []
         for i in reversed(range(len(sentences))):
             if len(sentences[i]) == 0:
-                empty_sents.append((i, sentences.pop(i), vrefs.pop(i)))
+                sentences.pop(i)
+                empty_sents.append((i, vrefs.pop(i)))
 
-        translations = list(self.translate(sentences, src_iso, trg_iso, vrefs))
+        translations = list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs))
 
         # Add empty sentences back in
-        for idx, sent, vref in reversed(empty_sents):
-            translations.insert(idx, sent)
+        for idx, vref in reversed(empty_sents):
+            translations.insert(idx, [])
             vrefs.insert(idx, vref)
 
-        rows = [([ref], translation) for ref, translation in zip(vrefs, translations)]
-
-        # Insert translation into the USFM structure of an existing project
-        # If the target project is not the same as the translated file's original project,
-        # no verses outside of the ones translated will be overwritten
-        use_src_project = trg_project is None and src_from_project
-        trg_format_project = src_file_path.parent.name if use_src_project else trg_project
-        if trg_format_project is not None:
-            dest_project_path = get_project_dir(trg_format_project)
-            dest_updater = FileParatextProjectTextUpdater(dest_project_path)
-            usfm_out = dest_updater.update_usfm(
-                src_file_text.id, rows, strip_all_text=use_src_project, prefer_existing_text=False
-            )
+        draft_set: DraftGroup = DraftGroup(translations)
+        for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
+            rows = [([ref], translation) for ref, translation in zip(vrefs, translated_draft)]
 
-            if usfm_out is None:
-                raise FileNotFoundError(f"Book {src_file_text.id} does not exist in target project {trg_project}")
-        # Insert translation into the USFM structure of an individual file
-        else:
-            with open(src_file_path, encoding="utf-8-sig") as f:
-                usfm = f.read()
-            handler = UpdateUsfmParserHandler(rows, vrefs[0].book, strip_all_text=True)
-            parse_usfm(usfm, handler)
-            usfm_out = handler.get_usfm()
-
-        # Insert draft remark and write to output path
-        description = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
-        usfm_out = insert_draft_remark(usfm_out, vrefs[0].book, description, experiment_ckpt_str)
-        encoding = src_settings.encoding if src_from_project else "utf-8"
-        with out_path.open("w", encoding=encoding) as f:
-            f.write(usfm_out)
-
-    def translate_docx(self, src_file_path: Path, trg_file_path: Path, src_iso: str, trg_iso: str) -> None:
+            # Insert translation into the USFM structure of an existing project
+            # If the target project is not the same as the translated file's original project,
+            # no verses outside of the ones translated will be overwritten
+            use_src_project = trg_project is None and src_from_project
+            trg_format_project = src_file_path.parent.name if use_src_project else trg_project
+            if trg_format_project is not None:
+                dest_project_path = get_project_dir(trg_format_project)
+                dest_updater = FileParatextProjectTextUpdater(dest_project_path)
+                usfm_out = dest_updater.update_usfm(
+                    src_file_text.id, rows, strip_all_text=use_src_project, prefer_existing_text=False
+                )
+
+                if usfm_out is None:
+                    raise FileNotFoundError(f"Book {src_file_text.id} does not exist in target project {trg_project}")
+            # Insert translation into the USFM structure of an individual file
+            else:
+                with open(src_file_path, encoding="utf-8-sig") as f:
+                    usfm = f.read()
+                handler = UpdateUsfmParserHandler(rows, vrefs[0].book, strip_all_text=True)
+                parse_usfm(usfm, handler)
+                usfm_out = handler.get_usfm()
+
+            # Insert draft remark and write to output path
+            description = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
+            usfm_out = insert_draft_remark(usfm_out, vrefs[0].book, description, experiment_ckpt_str)
+            encoding = src_settings.encoding if src_from_project else "utf-8"
+
+            if produce_multiple_translations:
+                trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
+            else:
+                trg_draft_file_path = trg_file_path
+
+            with trg_draft_file_path.open("w", encoding=encoding) as f:
+                f.write(usfm_out)
+
+    def translate_docx(
+        self,
+        src_file_path: Path,
+        trg_file_path: Path,
+        src_iso: str,
+        trg_iso: str,
+        produce_multiple_translations: bool = False,
+    ) -> None:
         tokenizer: nltk.tokenize.PunktSentenceTokenizer
         try:
             src_lang = Lang(src_iso)
@@ -190,9 +253,19 @@ def translate_docx(self, src_file_path: Path, trg_file_path: Path, src_iso: str,
                 sentences.append(sentence)
                 paras.append(i)
 
-        for para, group in groupby(zip(self.translate(sentences, src_iso, trg_iso), paras), key=lambda t: t[1]):
-            text = " ".join(s[0] for s in group)
-            doc.paragraphs[para].text = text
+        draft_set: DraftGroup = DraftGroup(
+            list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations))
+        )
+
+        for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
+            for para, group in groupby(zip(translated_draft, paras), key=lambda t: t[1]):
+                text = " ".join(s[0] for s in group)
+                doc.paragraphs[para].text = text
+
+            if produce_multiple_translations:
+                trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
+            else:
+                trg_draft_file_path = trg_file_path
 
-        with trg_file_path.open("wb") as file:
-            doc.save(file)
+            with trg_draft_file_path.open("wb") as file:
+                doc.save(file)
@@ -38,6 +38,7 @@
 )
 from ..common.environment import SIL_NLP_ENV
 from ..common.script_utils import get_script, is_represented
+from ..common.translator import TranslationGroup
 from ..common.utils import NoiseMethod, Side, create_noise_methods, get_mt_exp_dir, is_set, set_seed
 from .augment import AugmentMethod, create_augment_methods
 from .tokenizer import Tokenizer
@@ -320,6 +321,7 @@ def translate_test_files(
         self,
         input_paths: List[Path],
         translation_paths: List[Path],
+        produce_multiple_translations: bool = False,
         vref_paths: Optional[List[Path]] = None,
         ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST,
     ) -> None: ...
@@ -332,7 +334,7 @@ def translate(
         trg_iso: str,
         vrefs: Optional[Iterable[VerseRef]] = None,
         ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST,
-    ) -> Iterable[str]: ...
+    ) -> Iterable[TranslationGroup]: ...
 
     @abstractmethod
     def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: ...
 
@@ -28,6 +28,7 @@ class SILExperiment:
     run_train: bool = False
     run_test: bool = False
     run_translate: bool = False
+    produce_multiple_translations: bool = False
     scorers: Set[str] = field(default_factory=set)
     score_by_book: bool = False
     commit: Optional[str] = None
@@ -105,6 +106,7 @@ def translate(self):
                     config.get("src_project"),
                     config.get("trg_project"),
                     config.get("trg_iso"),
+                    self.produce_multiple_translations,
                     config.get("include_inline_elements", False),
                 )
             elif config.get("src_prefix"):
@@ -115,13 +117,15 @@ def translate(self):
                     config.get("end_seq"),
                     config.get("src_iso"),
                     config.get("trg_iso"),
+                    self.produce_multiple_translations,
                 )
             elif config.get("src"):
                 translator.translate_files(
                     config.get("src"),
                     config.get("trg"),
                     config.get("src_iso"),
                     config.get("trg_iso"),
+                    self.produce_multiple_translations,
                     config.get("include_inline_elements", False),
                 )
             else:
@@ -150,6 +154,12 @@ def main() -> None:
     parser.add_argument("--train", default=False, action="store_true", help="Run the train step.")
     parser.add_argument("--test", default=False, action="store_true", help="Run the test step.")
     parser.add_argument("--translate", default=False, action="store_true", help="Create drafts.")
+    parser.add_argument(
+        "--multiple-translations",
+        default=False,
+        action="store_true",
+        help='Produce multiple translations of each verse. These will be saved in separate files with suffixes like ".1.txt", ".2.txt", etc.',
+    )
     parser.add_argument("--score-by-book", default=False, action="store_true", help="Score individual books")
     parser.add_argument("--mt-dir", default=None, type=str, help="The machine translation directory.")
     parser.add_argument(
@@ -199,6 +209,7 @@ def main() -> None:
         run_train=args.train,
         run_test=args.test,
         run_translate=args.translate,
+        produce_multiple_translations=args.multiple_translations,
         scorers=set(s.lower() for s in args.scorers),
         score_by_book=args.score_by_book,
         commit=args.commit,