Fix mixed-direction text layout in translations (Resolves #34) #37

Open · wants to merge 1 commit into base: main
75 changes: 75 additions & 0 deletions app/process.py
@@ -1,3 +1,4 @@
import re
from difflib import Differ

import docx
@@ -86,6 +87,52 @@ def diff_texts(text1, text2):
return highlighted_text


def tokenize_mixed_direction_text(text: str, language: str) -> str:
"""
Tokenizes the given text while handling LTR segments embedded in an RTL context,
preserving formatting such as spaces and newlines. Latin-script segments are wrapped in Unicode
Bidirectional Algorithm embedding controls (RLE ... PDF) so they display correctly within RTL text.

Args:
text (str): The text to be tokenized and formatted.
language (str): The language of the text, which determines text directionality.

Returns:
str: The tokenized text with LTR words appropriately wrapped to preserve reading flow in RTL languages.
"""
rtl_languages = {
"Arabic",
"Aramaic",
"Azeri",
"Dhivehi/Maldivian",
"Hebrew",
"Kurdish (Sorani)",
"Persian/Farsi",
"Urdu",
}
is_rtl = language in rtl_languages

# Regex to capture words, non-word characters, and any whitespace
words_and_delimiters = re.findall(r"\w+|[^\w\s]+|\s+", text)
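# e.g. "hello, عالم" -> ["hello", ",", " ", "عالم"]: word runs, punctuation runs, and whitespace runs are kept as separate tokens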

new_text = []
ltr_pattern = re.compile(
"[A-Za-z]+"
) # This pattern identifies Latin script

if is_rtl:
for segment in words_and_delimiters:
# Check if the segment contains Latin script and not just whitespace
if ltr_pattern.search(segment) and not segment.isspace():
# Wrap LTR segments with Right-to-Left Embedding (RLE) and Pop Directional Format (PDF)
segment = "\u202b" + segment + "\u202c"
new_text.append(segment)
else:
new_text = words_and_delimiters # Non-RTL texts are returned unchanged

return "".join(new_text)


# modified from src.translation_agent.utils.translate
def translator(
source_lang: str,
@@ -116,6 +163,13 @@ def translator(
final_translation = one_chunk_improve_translation(
source_lang, target_lang, source_text, init_translation, reflection
)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)
reflection = tokenize_mixed_direction_text(reflection, target_lang)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation

@@ -142,6 +196,9 @@ def translator(
)

init_translation = "".join(translation_1_chunks)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)

progress((2, 3), desc="Reflection...")
reflection_chunks = multichunk_reflect_on_translation(
@@ -153,6 +210,7 @@
)

reflection = "".join(reflection_chunks)
reflection = tokenize_mixed_direction_text(reflection, target_lang)

progress((3, 3), desc="Second translation...")
translation_2_chunks = multichunk_improve_translation(
@@ -164,6 +222,9 @@
)

final_translation = "".join(translation_2_chunks)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation

@@ -206,6 +267,13 @@ def translator_sec(
final_translation = one_chunk_improve_translation(
source_lang, target_lang, source_text, init_translation, reflection
)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)
reflection = tokenize_mixed_direction_text(reflection, target_lang)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation

@@ -232,6 +300,9 @@ def translator_sec(
)

init_translation = "".join(translation_1_chunks)
init_translation = tokenize_mixed_direction_text(
init_translation, target_lang
)

try:
model_load(endpoint2, base2, model2, api_key2)
@@ -248,6 +319,7 @@
)

reflection = "".join(reflection_chunks)
reflection = tokenize_mixed_direction_text(reflection, target_lang)

progress((3, 3), desc="Second translation...")
translation_2_chunks = multichunk_improve_translation(
@@ -259,5 +331,8 @@
)

final_translation = "".join(translation_2_chunks)
final_translation = tokenize_mixed_direction_text(
final_translation, target_lang
)

return init_translation, reflection, final_translation
12 changes: 2 additions & 10 deletions src/translation_agent/utils.py
@@ -92,9 +92,7 @@ def one_chunk_initial_translation(

{target_lang}:"""

prompt = translation_prompt.format(source_text=source_text)

translation = get_completion(prompt, system_message=system_message)
translation = get_completion(translation_prompt, system_message=system_message)

return translation

@@ -170,13 +168,7 @@
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

prompt = reflection_prompt.format(
source_lang=source_lang,
target_lang=target_lang,
source_text=source_text,
translation_1=translation_1,
)
reflection = get_completion(prompt, system_message=system_message)
reflection = get_completion(reflection_prompt, system_message=system_message)
return reflection
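
For the utils.py changes above, a brief sketch of why dropping the extra .format() pass matters, assuming translation_prompt and reflection_prompt are f-strings whose placeholders have already been interpolated (as the surrounding code suggests); the strings below are hypothetical:

# Hypothetical illustration only; not code from this repository.
source_text = 'keep this JSON: {"key": 1}'
prompt = f"Translate the following text.\n{source_text}"  # f-string: braces inside source_text are now literal text

try:
    prompt.format(source_text=source_text)  # a second .format() pass re-parses those literal braces
except KeyError as exc:
    print("second .format() pass failed:", exc)  # KeyError: '"key"'

Passing the already-interpolated prompt straight to get_completion, as the diff does, avoids that second parse.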

