Skip to content

Commit 73b7dce

Browse files
authored
Merge pull request #533 from sillsdev/513-multiple-diverse-drafts
#513 multiple diverse drafts - merging pull request
2 parents 3cfab57 + 0af9c72 commit 73b7dce

File tree

8 files changed

+424
-100
lines changed

8 files changed

+424
-100
lines changed

silnlp/common/translate_google.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,33 @@
11
import argparse
2+
import logging
23
from typing import Iterable, Optional
34

45
from google.cloud import translate_v2 as translate
56
from machine.scripture import VerseRef, book_id_to_number
67

78
from ..common.environment import SIL_NLP_ENV
89
from .paratext import book_file_name_digits
9-
from .translator import Translator
10+
from .translator import TranslationGroup, Translator
1011
from .utils import get_git_revision_hash, get_mt_exp_dir
1112

13+
LOGGER = logging.getLogger(__package__ + ".translate")
14+
1215

1316
class GoogleTranslator(Translator):
1417
def __init__(self) -> None:
1518
self._translate_client = translate.Client()
1619

1720
def translate(
18-
self, sentences: Iterable[str], src_iso: str, trg_iso: str, vrefs: Optional[Iterable[VerseRef]] = None
19-
) -> Iterable[str]:
21+
self,
22+
sentences: Iterable[str],
23+
src_iso: str,
24+
trg_iso: str,
25+
produce_multiple_translations: bool = False,
26+
vrefs: Optional[Iterable[VerseRef]] = None,
27+
) -> Iterable[TranslationGroup]:
28+
if produce_multiple_translations:
29+
LOGGER.warning("Google Translator does not support --multiple-translations")
30+
2031
for sentence in sentences:
2132
if len(sentence) == 0:
2233
yield ""
@@ -25,7 +36,7 @@ def translate(
2536
sentence, source_language=src_iso, target_language=trg_iso, format_="text"
2637
)
2738
translation = results["translatedText"]
28-
yield translation
39+
yield [translation]
2940

3041

3142
def main() -> None:

silnlp/common/translator.py

Lines changed: 118 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -46,22 +46,67 @@ def insert_draft_remark(
4646
return "\n".join(lines)
4747

4848

49+
# A group of multiple translations of a single sentence
TranslationGroup = List[str]

# A list representing a single draft (one translation of each input sentence)
TranslatedDraft = List[str]


# A wrapper around List[TranslationGroup] that allows upstream consumers to view a
# list of translation groups as a collection of discrete drafts
class DraftGroup:
    """Present per-sentence translation groups as a list of whole drafts.

    ``translation_groups[i][k]`` is the k-th alternative translation of
    sentence ``i``; ``get_drafts()[k][i]`` is the same string, regrouped so
    that each draft holds one translation of every sentence.

    An empty group (``[]`` or ``""``) is treated as a placeholder for a
    sentence that was not translated: it contributes an empty string to every
    draft and is ignored when working out how many drafts there are.
    """

    def __init__(self, translation_groups: List[TranslationGroup]):
        """Store the groups and derive the number of drafts.

        Args:
            translation_groups: One group of alternative translations per
                input sentence. May be empty, and may contain empty groups.
        """
        self.translation_groups = translation_groups
        # Take the count from the first group that actually holds translations.
        # Indexing element 0 directly would raise IndexError for an empty list
        # and would report zero drafts whenever the first sentence is an empty
        # placeholder.
        self.num_drafts: int = next((len(group) for group in translation_groups if group), 0)

    def get_drafts(self) -> List[TranslatedDraft]:
        """Return ``num_drafts`` drafts, each with one entry per sentence.

        Returns:
            A list of drafts. It is empty when there are no groups or when
            every group is an empty placeholder.

        Raises:
            IndexError: If a non-empty group holds fewer translations than
                ``num_drafts``.
        """
        drafts: List[TranslatedDraft] = [[] for _ in range(self.num_drafts)]

        for translation_group in self.translation_groups:
            for draft_index, draft in enumerate(drafts):
                # Placeholder groups have nothing to index into, so keep the
                # sentence slot aligned across drafts with an empty string.
                draft.append(translation_group[draft_index] if translation_group else "")

        return drafts
71+
72+
4973
class Translator(ABC):
5074
@abstractmethod
5175
def translate(
52-
self, sentences: Iterable[str], src_iso: str, trg_iso: str, vrefs: Optional[Iterable[VerseRef]] = None
53-
) -> Iterable[str]:
76+
self,
77+
sentences: Iterable[str],
78+
src_iso: str,
79+
trg_iso: str,
80+
produce_multiple_translations: bool = False,
81+
vrefs: Optional[Iterable[VerseRef]] = None,
82+
) -> Iterable[TranslationGroup]:
5483
pass
5584

56-
def translate_text(self, src_file_path: Path, trg_file_path: Path, src_iso: str, trg_iso: str) -> None:
57-
write_corpus(trg_file_path, self.translate(load_corpus(src_file_path), src_iso, trg_iso))
85+
def translate_text(
    self,
    src_file_path: Path,
    trg_file_path: Path,
    src_iso: str,
    trg_iso: str,
    produce_multiple_translations: bool = False,
) -> None:
    """Translate a corpus file and write each resulting draft with write_corpus.

    The result of ``load_corpus(src_file_path)`` is passed to
    ``self.translate``; the returned translation groups are regrouped into
    drafts by ``DraftGroup`` and every draft is written out.

    Args:
        src_file_path: Path handed to ``load_corpus``.
        trg_file_path: Output path. Used as-is unless
            ``produce_multiple_translations`` is set.
        src_iso: Source language code forwarded to ``self.translate``.
        trg_iso: Target language code forwarded to ``self.translate``.
        produce_multiple_translations: Forwarded to ``self.translate``. When
            true, the 1-based draft number is inserted before the file
            suffix of ``trg_file_path`` for each draft.
    """
    source_sentences = load_corpus(src_file_path)
    translation_groups = list(
        self.translate(source_sentences, src_iso, trg_iso, produce_multiple_translations)
    )
    drafts = DraftGroup(translation_groups).get_drafts()

    for draft_number, draft_sentences in enumerate(drafts, start=1):
        # Default to the requested path; only numbered output needs a new name.
        output_path = trg_file_path
        if produce_multiple_translations:
            output_path = trg_file_path.with_suffix(f".{draft_number}{trg_file_path.suffix}")
        write_corpus(output_path, draft_sentences)
58102

59103
def translate_book(
60104
self,
61105
src_project: str,
62106
book: str,
63107
output_path: Path,
64108
trg_iso: str,
109+
produce_multiple_translations: bool = False,
65110
chapters: List[int] = [],
66111
trg_project: Optional[str] = None,
67112
include_inline_elements: bool = False,
@@ -78,6 +123,7 @@ def translate_book(
78123
output_path,
79124
get_iso(src_project),
80125
trg_iso,
126+
produce_multiple_translations,
81127
chapters,
82128
trg_project,
83129
include_inline_elements,
@@ -87,9 +133,10 @@ def translate_book(
87133
def translate_usfm(
88134
self,
89135
src_file_path: Path,
90-
out_path: Path,
136+
trg_file_path: Path,
91137
src_iso: str,
92138
trg_iso: str,
139+
produce_multiple_translations: bool = False,
93140
chapters: List[int] = [],
94141
trg_project: Optional[str] = None,
95142
include_inline_elements: bool = False,
@@ -131,47 +178,63 @@ def translate_usfm(
131178
empty_sents = []
132179
for i in reversed(range(len(sentences))):
133180
if len(sentences[i]) == 0:
134-
empty_sents.append((i, sentences.pop(i), vrefs.pop(i)))
181+
sentences.pop(i)
182+
empty_sents.append((i, vrefs.pop(i)))
135183

136-
translations = list(self.translate(sentences, src_iso, trg_iso, vrefs))
184+
translations = list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations, vrefs))
137185

138186
# Add empty sentences back in
139-
for idx, sent, vref in reversed(empty_sents):
140-
translations.insert(idx, sent)
187+
for idx, vref in reversed(empty_sents):
188+
translations.insert(idx, [])
141189
vrefs.insert(idx, vref)
142190

143-
rows = [([ref], translation) for ref, translation in zip(vrefs, translations)]
144-
145-
# Insert translation into the USFM structure of an existing project
146-
# If the target project is not the same as the translated file's original project,
147-
# no verses outside of the ones translated will be overwritten
148-
use_src_project = trg_project is None and src_from_project
149-
trg_format_project = src_file_path.parent.name if use_src_project else trg_project
150-
if trg_format_project is not None:
151-
dest_project_path = get_project_dir(trg_format_project)
152-
dest_updater = FileParatextProjectTextUpdater(dest_project_path)
153-
usfm_out = dest_updater.update_usfm(
154-
src_file_text.id, rows, strip_all_text=use_src_project, prefer_existing_text=False
155-
)
191+
draft_set: DraftGroup = DraftGroup(translations)
192+
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
193+
rows = [([ref], translation) for ref, translation in zip(vrefs, translated_draft)]
156194

157-
if usfm_out is None:
158-
raise FileNotFoundError(f"Book {src_file_text.id} does not exist in target project {trg_project}")
159-
# Insert translation into the USFM structure of an individual file
160-
else:
161-
with open(src_file_path, encoding="utf-8-sig") as f:
162-
usfm = f.read()
163-
handler = UpdateUsfmParserHandler(rows, vrefs[0].book, strip_all_text=True)
164-
parse_usfm(usfm, handler)
165-
usfm_out = handler.get_usfm()
166-
167-
# Insert draft remark and write to output path
168-
description = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
169-
usfm_out = insert_draft_remark(usfm_out, vrefs[0].book, description, experiment_ckpt_str)
170-
encoding = src_settings.encoding if src_from_project else "utf-8"
171-
with out_path.open("w", encoding=encoding) as f:
172-
f.write(usfm_out)
173-
174-
def translate_docx(self, src_file_path: Path, trg_file_path: Path, src_iso: str, trg_iso: str) -> None:
195+
# Insert translation into the USFM structure of an existing project
196+
# If the target project is not the same as the translated file's original project,
197+
# no verses outside of the ones translated will be overwritten
198+
use_src_project = trg_project is None and src_from_project
199+
trg_format_project = src_file_path.parent.name if use_src_project else trg_project
200+
if trg_format_project is not None:
201+
dest_project_path = get_project_dir(trg_format_project)
202+
dest_updater = FileParatextProjectTextUpdater(dest_project_path)
203+
usfm_out = dest_updater.update_usfm(
204+
src_file_text.id, rows, strip_all_text=use_src_project, prefer_existing_text=False
205+
)
206+
207+
if usfm_out is None:
208+
raise FileNotFoundError(f"Book {src_file_text.id} does not exist in target project {trg_project}")
209+
# Insert translation into the USFM structure of an individual file
210+
else:
211+
with open(src_file_path, encoding="utf-8-sig") as f:
212+
usfm = f.read()
213+
handler = UpdateUsfmParserHandler(rows, vrefs[0].book, strip_all_text=True)
214+
parse_usfm(usfm, handler)
215+
usfm_out = handler.get_usfm()
216+
217+
# Insert draft remark and write to output path
218+
description = f"project {src_file_text.project}" if src_from_project else f"file {src_file_path.name}"
219+
usfm_out = insert_draft_remark(usfm_out, vrefs[0].book, description, experiment_ckpt_str)
220+
encoding = src_settings.encoding if src_from_project else "utf-8"
221+
222+
if produce_multiple_translations:
223+
trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
224+
else:
225+
trg_draft_file_path = trg_file_path
226+
227+
with trg_draft_file_path.open("w", encoding=encoding) as f:
228+
f.write(usfm_out)
229+
230+
def translate_docx(
231+
self,
232+
src_file_path: Path,
233+
trg_file_path: Path,
234+
src_iso: str,
235+
trg_iso: str,
236+
produce_multiple_translations: bool = False,
237+
) -> None:
175238
tokenizer: nltk.tokenize.PunktSentenceTokenizer
176239
try:
177240
src_lang = Lang(src_iso)
@@ -190,9 +253,19 @@ def translate_docx(self, src_file_path: Path, trg_file_path: Path, src_iso: str,
190253
sentences.append(sentence)
191254
paras.append(i)
192255

193-
for para, group in groupby(zip(self.translate(sentences, src_iso, trg_iso), paras), key=lambda t: t[1]):
194-
text = " ".join(s[0] for s in group)
195-
doc.paragraphs[para].text = text
256+
draft_set: DraftGroup = DraftGroup(
257+
list(self.translate(sentences, src_iso, trg_iso, produce_multiple_translations))
258+
)
259+
260+
for draft_index, translated_draft in enumerate(draft_set.get_drafts(), 1):
261+
for para, group in groupby(zip(translated_draft, paras), key=lambda t: t[1]):
262+
text = " ".join(s[0] for s in group)
263+
doc.paragraphs[para].text = text
264+
265+
if produce_multiple_translations:
266+
trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
267+
else:
268+
trg_draft_file_path = trg_file_path
196269

197-
with trg_file_path.open("wb") as file:
198-
doc.save(file)
270+
with trg_draft_file_path.open("wb") as file:
271+
doc.save(file)

silnlp/nmt/config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
)
3939
from ..common.environment import SIL_NLP_ENV
4040
from ..common.script_utils import get_script, is_represented
41+
from ..common.translator import TranslationGroup
4142
from ..common.utils import NoiseMethod, Side, create_noise_methods, get_mt_exp_dir, is_set, set_seed
4243
from .augment import AugmentMethod, create_augment_methods
4344
from .tokenizer import Tokenizer
@@ -320,6 +321,7 @@ def translate_test_files(
320321
self,
321322
input_paths: List[Path],
322323
translation_paths: List[Path],
324+
produce_multiple_translations: bool = False,
323325
vref_paths: Optional[List[Path]] = None,
324326
ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST,
325327
) -> None: ...
@@ -332,7 +334,7 @@ def translate(
332334
trg_iso: str,
333335
vrefs: Optional[Iterable[VerseRef]] = None,
334336
ckpt: Union[CheckpointType, str, int] = CheckpointType.LAST,
335-
) -> Iterable[str]: ...
337+
) -> Iterable[TranslationGroup]: ...
336338

337339
@abstractmethod
338340
def get_checkpoint_path(self, ckpt: Union[CheckpointType, str, int]) -> Tuple[Path, int]: ...

silnlp/nmt/experiment.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ class SILExperiment:
2828
run_train: bool = False
2929
run_test: bool = False
3030
run_translate: bool = False
31+
produce_multiple_translations: bool = False
3132
scorers: Set[str] = field(default_factory=set)
3233
score_by_book: bool = False
3334
commit: Optional[str] = None
@@ -105,6 +106,7 @@ def translate(self):
105106
config.get("src_project"),
106107
config.get("trg_project"),
107108
config.get("trg_iso"),
109+
self.produce_multiple_translations,
108110
config.get("include_inline_elements", False),
109111
)
110112
elif config.get("src_prefix"):
@@ -115,13 +117,15 @@ def translate(self):
115117
config.get("end_seq"),
116118
config.get("src_iso"),
117119
config.get("trg_iso"),
120+
self.produce_multiple_translations,
118121
)
119122
elif config.get("src"):
120123
translator.translate_files(
121124
config.get("src"),
122125
config.get("trg"),
123126
config.get("src_iso"),
124127
config.get("trg_iso"),
128+
self.produce_multiple_translations,
125129
config.get("include_inline_elements", False),
126130
)
127131
else:
@@ -150,6 +154,12 @@ def main() -> None:
150154
parser.add_argument("--train", default=False, action="store_true", help="Run the train step.")
151155
parser.add_argument("--test", default=False, action="store_true", help="Run the test step.")
152156
parser.add_argument("--translate", default=False, action="store_true", help="Create drafts.")
157+
parser.add_argument(
158+
"--multiple-translations",
159+
default=False,
160+
action="store_true",
161+
help='Produce multiple translations of each verse. These will be saved in separate files with suffixes like ".1.txt", ".2.txt", etc.',
162+
)
153163
parser.add_argument("--score-by-book", default=False, action="store_true", help="Score individual books")
154164
parser.add_argument("--mt-dir", default=None, type=str, help="The machine translation directory.")
155165
parser.add_argument(
@@ -199,6 +209,7 @@ def main() -> None:
199209
run_train=args.train,
200210
run_test=args.test,
201211
run_translate=args.translate,
212+
produce_multiple_translations=args.multiple_translations,
202213
scorers=set(s.lower() for s in args.scorers),
203214
score_by_book=args.score_by_book,
204215
commit=args.commit,

0 commit comments

Comments
 (0)