Skip to content

Commit

Permalink
remove redundant line (andrewyng#8)
Browse files Browse the repository at this point in the history
* remove redundant line

the removed line was redundant - `source_text` is already substituted into the prompt via the f-string

* Update utils.py
  • Loading branch information
rdmueller authored and Salah-Sal committed Jul 12, 2024
1 parent 2f0c344 commit e64bc03
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 10 deletions.
75 changes: 75 additions & 0 deletions app/process.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from difflib import Differ

import docx
Expand Down Expand Up @@ -86,6 +87,52 @@ def diff_texts(text1, text2):
return highlighted_text


def tokenize_mixed_direction_text(text: str, language: str) -> str:
    """Wrap Latin-script runs in RTL text with bidi embedding controls.

    Splits *text* into maximal runs of word characters, punctuation, and
    whitespace.  When *language* reads right-to-left, every run that
    contains Latin letters is surrounded by U+202B (RIGHT-TO-LEFT
    EMBEDDING) and U+202C (POP DIRECTIONAL FORMATTING) so the embedded
    LTR word stays a single unit inside the surrounding RTL flow,
    preserving spaces and newlines.  Text in any other language is
    returned unchanged.

    Args:
        text (str): The text to be tokenized and formatted.
        language (str): Language name used to decide directionality.

    Returns:
        str: The formatted text (identical to the input for LTR languages).
    """
    rtl_languages = {
        "Arabic",
        "Aramaic",
        "Azeri",
        "Dhivehi/Maldivian",
        "Hebrew",
        "Kurdish (Sorani)",
        "Persian/Farsi",
        "Urdu",
    }

    if language not in rtl_languages:
        # The findall() below partitions the string exactly, so joining the
        # pieces back together would reproduce the input; skip the round trip.
        return text

    # Detects any Latin-script letter within a run.
    has_latin = re.compile("[A-Za-z]").search

    pieces = []
    # \w+ word runs, [^\w\s]+ punctuation runs, \s+ whitespace runs:
    # together these cover every character, so nothing is dropped.
    for run in re.findall(r"\w+|[^\w\s]+|\s+", text):
        if has_latin(run) and not run.isspace():
            # Embed the LTR run between RLE and PDF control characters.
            run = "\u202b" + run + "\u202c"
        pieces.append(run)
    return "".join(pieces)


# modified from src.translation_agent.utils.translate
def translator(
source_lang: str,
Expand Down Expand Up @@ -116,6 +163,13 @@ def translator(
final_translation = one_chunk_improve_translation(
source_lang, target_lang, source_text, init_translation, reflection
)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)
reflection = tokenize_mixed_direction_text(reflection, target_lang)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation

Expand All @@ -142,6 +196,9 @@ def translator(
)

init_translation = "".join(translation_1_chunks)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)

progress((2, 3), desc="Reflection...")
reflection_chunks = multichunk_reflect_on_translation(
Expand All @@ -153,6 +210,7 @@ def translator(
)

reflection = "".join(reflection_chunks)
reflection = tokenize_mixed_direction_text(reflection, target_lang)

progress((3, 3), desc="Second translation...")
translation_2_chunks = multichunk_improve_translation(
Expand All @@ -164,6 +222,9 @@ def translator(
)

final_translation = "".join(translation_2_chunks)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation

Expand Down Expand Up @@ -206,6 +267,13 @@ def translator_sec(
final_translation = one_chunk_improve_translation(
source_lang, target_lang, source_text, init_translation, reflection
)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)
reflection = tokenize_mixed_direction_text(reflection, target_lang)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation

Expand All @@ -232,6 +300,9 @@ def translator_sec(
)

init_translation = "".join(translation_1_chunks)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)

try:
model_load(endpoint2, base2, model2, api_key2)
Expand All @@ -248,6 +319,7 @@ def translator_sec(
)

reflection = "".join(reflection_chunks)
reflection = tokenize_mixed_direction_text(reflection, target_lang)

progress((3, 3), desc="Second translation...")
translation_2_chunks = multichunk_improve_translation(
Expand All @@ -259,5 +331,8 @@ def translator_sec(
)

final_translation = "".join(translation_2_chunks)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation
12 changes: 2 additions & 10 deletions src/translation_agent/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,7 @@ def one_chunk_initial_translation(
{target_lang}:"""

prompt = translation_prompt.format(source_text=source_text)

translation = get_completion(prompt, system_message=system_message)
translation = get_completion(translation_prompt, system_message=system_message)

return translation

Expand Down Expand Up @@ -170,13 +168,7 @@ def one_chunk_reflect_on_translation(
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

prompt = reflection_prompt.format(
source_lang=source_lang,
target_lang=target_lang,
source_text=source_text,
translation_1=translation_1,
)
reflection = get_completion(prompt, system_message=system_message)
reflection = get_completion(reflection_prompt, system_message=system_message)
return reflection


Expand Down

0 comments on commit e64bc03

Please sign in to comment.