@@ -46,22 +46,67 @@ def insert_draft_remark(
46
46
return "\n " .join (lines )
47
47
48
48
49
# All alternative translations produced for a single input sentence
TranslationGroup = List[str]

# A single draft: one translation of each input sentence, in input order
TranslatedDraft = List[str]


class DraftGroup:
    """Wrapper around ``List[TranslationGroup]`` that exposes the groups as discrete drafts.

    ``translation_groups`` holds one group per input sentence. ``get_drafts`` regroups them so that draft ``i``
    contains the ``i``-th translation of every sentence.
    """

    def __init__(self, translation_groups: List[TranslationGroup]):
        self.translation_groups = translation_groups
        # Take the draft count from the first non-empty group: an empty group carries no translations, so it
        # says nothing about how many drafts exist. With no groups (or only empty ones) there are no drafts.
        self.num_drafts: int = next((len(group) for group in translation_groups if group), 0)

    def get_drafts(self) -> List[TranslatedDraft]:
        """Return ``num_drafts`` drafts, each holding one entry per translation group.

        A group with fewer than ``num_drafts`` translations contributes an empty string to the drafts it cannot
        fill, so every draft stays aligned with the list of translation groups.
        """
        drafts: List[TranslatedDraft] = [[] for _ in range(self.num_drafts)]
        for translation_group in self.translation_groups:
            for draft_index, draft in enumerate(drafts):
                has_translation = draft_index < len(translation_group)
                draft.append(translation_group[draft_index] if has_translation else "")
        return drafts
71
+
72
+
49
73
class Translator (ABC ):
50
74
@abstractmethod
def translate(
    self,
    sentences: Iterable[str],
    src_iso: str,
    trg_iso: str,
    produce_multiple_translations: bool = False,
    vrefs: Optional[Iterable[VerseRef]] = None,
) -> Iterable[TranslationGroup]:
    """Translate ``sentences`` from ``src_iso`` into ``trg_iso``; subclasses supply the implementation.

    The result is an iterable of translation groups. Callers in this class treat it as one group per input
    sentence, in input order, where each group lists the translations of that sentence.

    NOTE(review): ``produce_multiple_translations`` presumably asks the implementation for more than one
    translation per sentence -- confirm against the concrete translators.
    """
55
84
56
- def translate_text (self , src_file_path : Path , trg_file_path : Path , src_iso : str , trg_iso : str ) -> None :
57
- write_corpus (trg_file_path , self .translate (load_corpus (src_file_path ), src_iso , trg_iso ))
85
def translate_text(
    self,
    src_file_path: Path,
    trg_file_path: Path,
    src_iso: str,
    trg_iso: str,
    produce_multiple_translations: bool = False,
) -> None:
    """Translate the corpus at ``src_file_path`` and write the result next to ``trg_file_path``.

    When ``produce_multiple_translations`` is false, the draft is written to ``trg_file_path`` itself.
    Otherwise each draft goes to its own file, named by inserting the 1-based draft index before the
    original suffix (``out.txt`` -> ``out.1.txt``, ``out.2.txt``, ...).

    An empty source corpus produces an empty file at ``trg_file_path``.
    """
    translation_groups = list(
        self.translate(load_corpus(src_file_path), src_iso, trg_iso, produce_multiple_translations)
    )
    if not translation_groups:
        # An empty source yields no translation groups and therefore no drafts; write an empty target file
        # explicitly so the output still exists instead of failing or silently producing nothing.
        write_corpus(trg_file_path, [])
        return

    for draft_index, translated_draft in enumerate(DraftGroup(translation_groups).get_drafts(), 1):
        if produce_multiple_translations:
            trg_draft_file_path = trg_file_path.with_suffix(f".{draft_index}{trg_file_path.suffix}")
        else:
            trg_draft_file_path = trg_file_path
        write_corpus(trg_draft_file_path, translated_draft)
58
102
59
103
def translate_book (
60
104
self ,
61
105
src_project : str ,
62
106
book : str ,
63
107
output_path : Path ,
64
108
trg_iso : str ,
109
+ produce_multiple_translations : bool = False ,
65
110
chapters : List [int ] = [],
66
111
trg_project : Optional [str ] = None ,
67
112
include_inline_elements : bool = False ,
@@ -78,6 +123,7 @@ def translate_book(
78
123
output_path ,
79
124
get_iso (src_project ),
80
125
trg_iso ,
126
+ produce_multiple_translations ,
81
127
chapters ,
82
128
trg_project ,
83
129
include_inline_elements ,
@@ -87,9 +133,10 @@ def translate_book(
87
133
def translate_usfm (
88
134
self ,
89
135
src_file_path : Path ,
90
- out_path : Path ,
136
+ trg_file_path : Path ,
91
137
src_iso : str ,
92
138
trg_iso : str ,
139
+ produce_multiple_translations : bool = False ,
93
140
chapters : List [int ] = [],
94
141
trg_project : Optional [str ] = None ,
95
142
include_inline_elements : bool = False ,
@@ -131,47 +178,63 @@ def translate_usfm(
131
178
empty_sents = []
132
179
for i in reversed (range (len (sentences ))):
133
180
if len (sentences [i ]) == 0 :
134
- empty_sents .append ((i , sentences .pop (i ), vrefs .pop (i )))
181
+ sentences .pop (i )
182
+ empty_sents .append ((i , vrefs .pop (i )))
135
183
136
- translations = list (self .translate (sentences , src_iso , trg_iso , vrefs ))
184
+ translations = list (self .translate (sentences , src_iso , trg_iso , produce_multiple_translations , vrefs ))
137
185
138
186
# Add empty sentences back in
139
- for idx , sent , vref in reversed (empty_sents ):
140
- translations .insert (idx , sent )
187
+ for idx , vref in reversed (empty_sents ):
188
+ translations .insert (idx , [] )
141
189
vrefs .insert (idx , vref )
142
190
143
- rows = [([ref ], translation ) for ref , translation in zip (vrefs , translations )]
144
-
145
- # Insert translation into the USFM structure of an existing project
146
- # If the target project is not the same as the translated file's original project,
147
- # no verses outside of the ones translated will be overwritten
148
- use_src_project = trg_project is None and src_from_project
149
- trg_format_project = src_file_path .parent .name if use_src_project else trg_project
150
- if trg_format_project is not None :
151
- dest_project_path = get_project_dir (trg_format_project )
152
- dest_updater = FileParatextProjectTextUpdater (dest_project_path )
153
- usfm_out = dest_updater .update_usfm (
154
- src_file_text .id , rows , strip_all_text = use_src_project , prefer_existing_text = False
155
- )
191
+ draft_set : DraftGroup = DraftGroup (translations )
192
+ for draft_index , translated_draft in enumerate (draft_set .get_drafts (), 1 ):
193
+ rows = [([ref ], translation ) for ref , translation in zip (vrefs , translated_draft )]
156
194
157
- if usfm_out is None :
158
- raise FileNotFoundError (f"Book { src_file_text .id } does not exist in target project { trg_project } " )
159
- # Insert translation into the USFM structure of an individual file
160
- else :
161
- with open (src_file_path , encoding = "utf-8-sig" ) as f :
162
- usfm = f .read ()
163
- handler = UpdateUsfmParserHandler (rows , vrefs [0 ].book , strip_all_text = True )
164
- parse_usfm (usfm , handler )
165
- usfm_out = handler .get_usfm ()
166
-
167
- # Insert draft remark and write to output path
168
- description = f"project { src_file_text .project } " if src_from_project else f"file { src_file_path .name } "
169
- usfm_out = insert_draft_remark (usfm_out , vrefs [0 ].book , description , experiment_ckpt_str )
170
- encoding = src_settings .encoding if src_from_project else "utf-8"
171
- with out_path .open ("w" , encoding = encoding ) as f :
172
- f .write (usfm_out )
173
-
174
- def translate_docx (self , src_file_path : Path , trg_file_path : Path , src_iso : str , trg_iso : str ) -> None :
195
+ # Insert translation into the USFM structure of an existing project
196
+ # If the target project is not the same as the translated file's original project,
197
+ # no verses outside of the ones translated will be overwritten
198
+ use_src_project = trg_project is None and src_from_project
199
+ trg_format_project = src_file_path .parent .name if use_src_project else trg_project
200
+ if trg_format_project is not None :
201
+ dest_project_path = get_project_dir (trg_format_project )
202
+ dest_updater = FileParatextProjectTextUpdater (dest_project_path )
203
+ usfm_out = dest_updater .update_usfm (
204
+ src_file_text .id , rows , strip_all_text = use_src_project , prefer_existing_text = False
205
+ )
206
+
207
+ if usfm_out is None :
208
+ raise FileNotFoundError (f"Book { src_file_text .id } does not exist in target project { trg_project } " )
209
+ # Insert translation into the USFM structure of an individual file
210
+ else :
211
+ with open (src_file_path , encoding = "utf-8-sig" ) as f :
212
+ usfm = f .read ()
213
+ handler = UpdateUsfmParserHandler (rows , vrefs [0 ].book , strip_all_text = True )
214
+ parse_usfm (usfm , handler )
215
+ usfm_out = handler .get_usfm ()
216
+
217
+ # Insert draft remark and write to output path
218
+ description = f"project { src_file_text .project } " if src_from_project else f"file { src_file_path .name } "
219
+ usfm_out = insert_draft_remark (usfm_out , vrefs [0 ].book , description , experiment_ckpt_str )
220
+ encoding = src_settings .encoding if src_from_project else "utf-8"
221
+
222
+ if produce_multiple_translations :
223
+ trg_draft_file_path = trg_file_path .with_suffix (f".{ draft_index } { trg_file_path .suffix } " )
224
+ else :
225
+ trg_draft_file_path = trg_file_path
226
+
227
+ with trg_draft_file_path .open ("w" , encoding = encoding ) as f :
228
+ f .write (usfm_out )
229
+
230
+ def translate_docx (
231
+ self ,
232
+ src_file_path : Path ,
233
+ trg_file_path : Path ,
234
+ src_iso : str ,
235
+ trg_iso : str ,
236
+ produce_multiple_translations : bool = False ,
237
+ ) -> None :
175
238
tokenizer : nltk .tokenize .PunktSentenceTokenizer
176
239
try :
177
240
src_lang = Lang (src_iso )
@@ -190,9 +253,19 @@ def translate_docx(self, src_file_path: Path, trg_file_path: Path, src_iso: str,
190
253
sentences .append (sentence )
191
254
paras .append (i )
192
255
193
- for para , group in groupby (zip (self .translate (sentences , src_iso , trg_iso ), paras ), key = lambda t : t [1 ]):
194
- text = " " .join (s [0 ] for s in group )
195
- doc .paragraphs [para ].text = text
256
+ draft_set : DraftGroup = DraftGroup (
257
+ list (self .translate (sentences , src_iso , trg_iso , produce_multiple_translations ))
258
+ )
259
+
260
+ for draft_index , translated_draft in enumerate (draft_set .get_drafts (), 1 ):
261
+ for para , group in groupby (zip (translated_draft , paras ), key = lambda t : t [1 ]):
262
+ text = " " .join (s [0 ] for s in group )
263
+ doc .paragraphs [para ].text = text
264
+
265
+ if produce_multiple_translations :
266
+ trg_draft_file_path = trg_file_path .with_suffix (f".{ draft_index } { trg_file_path .suffix } " )
267
+ else :
268
+ trg_draft_file_path = trg_file_path
196
269
197
- with trg_file_path .open ("wb" ) as file :
198
- doc .save (file )
270
+ with trg_draft_file_path .open ("wb" ) as file :
271
+ doc .save (file )
0 commit comments