diff --git a/.gitignore b/.gitignore index b5844af..0d68f6f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ poetry.lock floresp-v2.0-rc.3 *cache wmt +outputs diff --git a/app/webui/README.md b/app/README.md similarity index 94% rename from app/webui/README.md rename to app/README.md index e6c4df5..1d84349 100644 --- a/app/webui/README.md +++ b/app/README.md @@ -38,7 +38,7 @@ This repository contains a Gradio web UI for a translation agent that utilizes v poetry install --with app poetry shell ``` - + 2. **Set API Keys:** - Rename `.env.sample` to `.env`, you can add your API keys for each service: @@ -53,11 +53,11 @@ This repository contains a Gradio web UI for a translation agent that utilizes v **Linux** ```bash - python app/webui/app.py + python app/app.py ``` **Windows** ```bash - python .\app\webui\app.py + python .\app\app.py ``` 4. **Access the Web UI:** diff --git a/app/webui/app.py b/app/app.py similarity index 55% rename from app/webui/app.py rename to app/app.py index 9cbb32b..06d3303 100644 --- a/app/webui/app.py +++ b/app/app.py @@ -1,8 +1,16 @@ import os import re -import gradio as gr from glob import glob -from process import model_load, diff_texts, translator, translator_sec, extract_docx, extract_pdf, extract_text + +import gradio as gr +from process import diff_texts +from process import extract_docx +from process import extract_pdf +from process import extract_text +from process import model_load +from process import translator +from process import translator_sec + def huanik( endpoint: str, @@ -22,28 +30,31 @@ def huanik( temperature: int, rpm: int, ): - if not source_text or source_lang == target_lang: - raise gr.Error("Please check that the content or options are entered correctly.") + raise gr.Error( + "Please check that the content or options are entered correctly." 
+ ) try: model_load(endpoint, base, model, api_key, temperature, rpm) except Exception as e: - raise gr.Error(f"An unexpected error occurred: {e}") + raise gr.Error(f"An unexpected error occurred: {e}") from e - source_text = re.sub(r'(?m)^\s*$\n?', '', source_text) + source_text = re.sub(r"(?m)^\s*$\n?", "", source_text) if choice: - init_translation, reflect_translation, final_translation = translator_sec( - endpoint2=endpoint2, - base2=base2, - model2=model2, - api_key2=api_key2, - source_lang=source_lang, - target_lang=target_lang, - source_text=source_text, - country=country, - max_tokens=max_tokens, + init_translation, reflect_translation, final_translation = ( + translator_sec( + endpoint2=endpoint2, + base2=base2, + model2=model2, + api_key2=api_key2, + source_lang=source_lang, + target_lang=target_lang, + source_text=source_text, + country=country, + max_tokens=max_tokens, + ) ) else: @@ -61,10 +72,12 @@ def huanik( combine_adjacent=True, show_legend=True, visible=True, - color_map={"removed": "red", "added": "green"}) + color_map={"removed": "red", "added": "green"}, + ) return init_translation, reflect_translation, final_translation, final_diff + def update_model(endpoint): endpoint_model_map = { "Groq": "llama3-70b-8192", @@ -79,6 +92,7 @@ def update_model(endpoint): base = gr.update(visible=False) return gr.update(value=endpoint_model_map[endpoint]), base + def read_doc(path): file_type = path.split(".")[-1] print(file_type) @@ -89,19 +103,22 @@ def read_doc(path): content = extract_docx(path) else: content = extract_text(path) - return re.sub(r'(?m)^\s*$\n?', '', content) + return re.sub(r"(?m)^\s*$\n?", "", content) else: raise gr.Error("Oops, unsupported files.") + def enable_sec(choice): if choice: - return gr.update(visible = True) + return gr.update(visible=True) else: - return gr.update(visible = False) + return gr.update(visible=False) + def update_menu(visible): return not visible, gr.update(visible=not visible) + def export_txt(strings): 
os.makedirs("outputs", exist_ok=True) base_count = len(glob(os.path.join("outputs", "*.txt"))) @@ -110,19 +127,33 @@ def export_txt(strings): f.write(strings) return gr.update(value=file_path, visible=True) -def switch(source_lang,source_text,target_lang,output_final): + +def switch(source_lang, source_text, target_lang, output_final): if output_final: - return gr.update(value=target_lang), gr.update(value=output_final), gr.update(value=source_lang), gr.update(value=source_text) + return ( + gr.update(value=target_lang), + gr.update(value=output_final), + gr.update(value=source_lang), + gr.update(value=source_text), + ) else: - return gr.update(value=target_lang), gr.update(value=source_text), gr.update(value=source_lang), gr.update(value="") + return ( + gr.update(value=target_lang), + gr.update(value=source_text), + gr.update(value=source_lang), + gr.update(value=""), + ) + -def closeBtnShow(): +def close_btn_show(): return gr.update(visible=False), gr.update(visible=True) -def closeBtnHide(output_final): + +def close_btn_hide(output_final): if output_final: return gr.update(visible=True), gr.update(visible=False) + TITLE = """
Translation Agent WebUI
@@ -182,8 +213,8 @@ def closeBtnHide(output_final): JS = """ function () { - const menuBtn = document.getElementById('menu'); - menuBtn.classList.toggle('active'); + const menu_btn = document.getElementById('menu'); + menu_btn.classList.toggle('active'); } """ @@ -191,41 +222,66 @@ def closeBtnHide(output_final): with gr.Blocks(theme="soft", css=CSS, fill_height=True) as demo: with gr.Row(): visible = gr.State(value=True) - menuBtn = gr.Button(value="", elem_classes="menu_btn", elem_id="menu", size="sm") + menu_btn = gr.Button( + value="", elem_classes="menu_btn", elem_id="menu", size="sm" + ) gr.HTML(TITLE) with gr.Row(): with gr.Column(scale=1) as menubar: endpoint = gr.Dropdown( label="Endpoint", - choices=["OpenAI","Groq","TogetherAI","Ollama","CUSTOM"], + choices=["OpenAI", "Groq", "TogetherAI", "Ollama", "CUSTOM"], value="OpenAI", ) - choice = gr.Checkbox(label="Additional Endpoint", info="Additional endpoint for reflection") - model = gr.Textbox(label="Model", value="gpt-4o", ) - api_key = gr.Textbox(label="API_KEY", type="password", ) + choice = gr.Checkbox( + label="Additional Endpoint", + info="Additional endpoint for reflection", + ) + model = gr.Textbox( + label="Model", + value="gpt-4o", + ) + api_key = gr.Textbox( + label="API_KEY", + type="password", + ) base = gr.Textbox(label="BASE URL", visible=False) with gr.Column(visible=False) as AddEndpoint: endpoint2 = gr.Dropdown( label="Additional Endpoint", - choices=["OpenAI","Groq","TogetherAI","Ollama","CUSTOM"], + choices=[ + "OpenAI", + "Groq", + "TogetherAI", + "Ollama", + "CUSTOM", + ], value="OpenAI", ) - model2 = gr.Textbox(label="Model", value="gpt-4o", ) - api_key2 = gr.Textbox(label="API_KEY", type="password", ) + model2 = gr.Textbox( + label="Model", + value="gpt-4o", + ) + api_key2 = gr.Textbox( + label="API_KEY", + type="password", + ) base2 = gr.Textbox(label="BASE URL", visible=False) with gr.Row(): source_lang = gr.Textbox( label="Source Lang", value="English", - elem_classes = "lang", + 
elem_classes="lang", ) target_lang = gr.Textbox( label="Target Lang", value="Spanish", - elem_classes = "lang", + elem_classes="lang", ) - switchBtn = gr.Button(value="🔄️") - country = gr.Textbox(label="Country", value="Argentina", max_lines=1) + switch_btn = gr.Button(value="🔄️") + country = gr.Textbox( + label="Country", value="Argentina", max_lines=1 + ) with gr.Accordion("Advanced Options", open=False): max_tokens = gr.Slider( label="Max tokens Per Chunk", @@ -233,63 +289,99 @@ def closeBtnHide(output_final): maximum=2046, value=1000, step=8, - ) + ) temperature = gr.Slider( label="Temperature", minimum=0, maximum=1.0, value=0.3, step=0.1, - ) + ) rpm = gr.Slider( label="Request Per Minute", minimum=1, maximum=1000, value=60, step=1, - ) - # json_mode = gr.Checkbox( - # False, - # label="Json Mode", - # ) + ) + with gr.Column(scale=4): source_text = gr.Textbox( label="Source Text", - value="How we live is so different from how we ought to live that he who studies "+\ - "what ought to be done rather than what is done will learn the way to his downfall "+\ - "rather than to his preservation.", + value="How we live is so different from how we ought to live that he who studies " + + "what ought to be done rather than what is done will learn the way to his downfall " + + "rather than to his preservation.", lines=12, ) with gr.Tab("Final"): - output_final = gr.Textbox(label="FInal Translation", lines=12, show_copy_button=True) + output_final = gr.Textbox( + label="Final Translation", lines=12, show_copy_button=True + ) with gr.Tab("Initial"): - output_init = gr.Textbox(label="Init Translation", lines=12, show_copy_button=True) + output_init = gr.Textbox( + label="Init Translation", lines=12, show_copy_button=True + ) with gr.Tab("Reflection"): - output_reflect = gr.Textbox(label="Reflection", lines=12, show_copy_button=True) + output_reflect = gr.Textbox( + label="Reflection", lines=12, show_copy_button=True + ) with gr.Tab("Diff"): - output_diff = 
gr.HighlightedText(visible = False) + output_diff = gr.HighlightedText(visible=False) with gr.Row(): submit = gr.Button(value="Translate") upload = gr.UploadButton(label="Upload", file_types=["text"]) export = gr.DownloadButton(visible=False) - clear = gr.ClearButton([source_text, output_init, output_reflect, output_final]) + clear = gr.ClearButton( + [source_text, output_init, output_reflect, output_final] + ) close = gr.Button(value="Stop", visible=False) - switchBtn.click(fn=switch, inputs=[source_lang,source_text,target_lang,output_final], outputs=[source_lang,source_text,target_lang,output_final]) + switch_btn.click( + fn=switch, + inputs=[source_lang, source_text, target_lang, output_final], + outputs=[source_lang, source_text, target_lang, output_final], + ) - menuBtn.click(fn=update_menu, inputs=visible, outputs=[visible, menubar], js=JS) + menu_btn.click( + fn=update_menu, inputs=visible, outputs=[visible, menubar], js=JS + ) endpoint.change(fn=update_model, inputs=[endpoint], outputs=[model, base]) choice.select(fn=enable_sec, inputs=[choice], outputs=[AddEndpoint]) - endpoint2.change(fn=update_model, inputs=[endpoint2], outputs=[model2, base2]) + endpoint2.change( + fn=update_model, inputs=[endpoint2], outputs=[model2, base2] + ) - start_ta = submit.click(fn=huanik, inputs=[endpoint, base, model, api_key, choice, endpoint2, base2, model2, api_key2, source_lang, target_lang, source_text, country, max_tokens, temperature, rpm], outputs=[output_init, output_reflect, output_final, output_diff]) - upload.upload(fn=read_doc, inputs = upload, outputs = source_text) + start_ta = submit.click( + fn=huanik, + inputs=[ + endpoint, + base, + model, + api_key, + choice, + endpoint2, + base2, + model2, + api_key2, + source_lang, + target_lang, + source_text, + country, + max_tokens, + temperature, + rpm, + ], + outputs=[output_init, output_reflect, output_final, output_diff], + ) + upload.upload(fn=read_doc, inputs=upload, outputs=source_text) 
output_final.change(fn=export_txt, inputs=output_final, outputs=[export]) - submit.click(fn=closeBtnShow, outputs=[clear, close]) - output_final.change(fn=closeBtnHide, inputs=output_final, outputs=[clear, close]) + submit.click(fn=close_btn_show, outputs=[clear, close]) + output_final.change( + fn=close_btn_hide, inputs=output_final, outputs=[clear, close] + ) close.click(fn=None, cancels=start_ta) if __name__ == "__main__": - demo.queue(api_open=False).launch(show_api=False, share=False) \ No newline at end of file + demo.queue(api_open=False).launch(show_api=False, share=False) diff --git a/app/webui/image.png b/app/image.png similarity index 100% rename from app/webui/image.png rename to app/image.png diff --git a/app/webui/patch.py b/app/patch.py similarity index 69% rename from app/webui/patch.py rename to app/patch.py index b4267c5..91cc56c 100644 --- a/app/webui/patch.py +++ b/app/patch.py @@ -1,12 +1,14 @@ -# a monkey patch for completion import os import time from functools import wraps from threading import Lock +from typing import Optional from typing import Union -import translation_agent.utils as Utils -import openai + import gradio as gr +import openai +import translation_agent.utils as utils + RPM = 60 MODEL = "" @@ -16,15 +18,16 @@ ENDPOINT = "" client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + # Add your LLMs here def model_load( - endpoint: str, - base_url: str, - model: str, - api_key: str = None, - temperature: float = TEMPERATURE, - rpm: int = RPM, - js_mode: bool = JS_MODE, + endpoint: str, + base_url: str, + model: str, + api_key: Optional[str] = None, + temperature: float = TEMPERATURE, + rpm: int = RPM, + js_mode: bool = JS_MODE, ): global client, RPM, MODEL, TEMPERATURE, JS_MODE, ENDPOINT ENDPOINT = endpoint @@ -34,15 +37,26 @@ def model_load( JS_MODE = js_mode if endpoint == "Groq": - client = openai.OpenAI(api_key=api_key if api_key else os.getenv("GROQ_API_KEY"), base_url="https://api.groq.com/openai/v1") + client = 
openai.OpenAI( + api_key=api_key if api_key else os.getenv("GROQ_API_KEY"), + base_url="https://api.groq.com/openai/v1", + ) elif endpoint == "TogetherAI": - client = openai.OpenAI(api_key=api_key if api_key else os.getenv("TOGETHER_API_KEY"), base_url="https://api.together.xyz/v1") + client = openai.OpenAI( + api_key=api_key if api_key else os.getenv("TOGETHER_API_KEY"), + base_url="https://api.together.xyz/v1", + ) elif endpoint == "CUSTOM": client = openai.OpenAI(api_key=api_key, base_url=base_url) elif endpoint == "Ollama": - client = openai.OpenAI(api_key="ollama", base_url="http://localhost:11434/v1") + client = openai.OpenAI( + api_key="ollama", base_url="http://localhost:11434/v1" + ) else: - client = openai.OpenAI(api_key=api_key if api_key else os.getenv("OPENAI_API_KEY")) + client = openai.OpenAI( + api_key=api_key if api_key else os.getenv("OPENAI_API_KEY") + ) + def rate_limit(get_max_per_minute): def decorator(func): @@ -63,9 +77,12 @@ def wrapper(*args, **kwargs): ret = func(*args, **kwargs) last_called[0] = time.time() return ret + return wrapper + return decorator + @rate_limit(lambda: RPM) def get_completion( prompt: str, @@ -112,7 +129,7 @@ def get_completion( ) return response.choices[0].message.content except Exception as e: - raise gr.Error(f"An unexpected error occurred: {e}") + raise gr.Error(f"An unexpected error occurred: {e}") from e else: try: response = client.chat.completions.create( @@ -126,17 +143,18 @@ def get_completion( ) return response.choices[0].message.content except Exception as e: - raise gr.Error(f"An unexpected error occurred: {e}") - -Utils.get_completion = get_completion - -one_chunk_initial_translation = Utils.one_chunk_initial_translation -one_chunk_reflect_on_translation = Utils.one_chunk_reflect_on_translation -one_chunk_improve_translation = Utils.one_chunk_improve_translation -one_chunk_translate_text = Utils.one_chunk_translate_text -num_tokens_in_string = Utils.num_tokens_in_string -multichunk_initial_translation 
= Utils.multichunk_initial_translation -multichunk_reflect_on_translation = Utils.multichunk_reflect_on_translation -multichunk_improve_translation = Utils.multichunk_improve_translation -multichunk_translation = Utils.multichunk_translation -calculate_chunk_size =Utils.calculate_chunk_size \ No newline at end of file + raise gr.Error(f"An unexpected error occurred: {e}") from e + + +utils.get_completion = get_completion + +one_chunk_initial_translation = utils.one_chunk_initial_translation +one_chunk_reflect_on_translation = utils.one_chunk_reflect_on_translation +one_chunk_improve_translation = utils.one_chunk_improve_translation +one_chunk_translate_text = utils.one_chunk_translate_text +num_tokens_in_string = utils.num_tokens_in_string +multichunk_initial_translation = utils.multichunk_initial_translation +multichunk_reflect_on_translation = utils.multichunk_reflect_on_translation +multichunk_improve_translation = utils.multichunk_improve_translation +multichunk_translation = utils.multichunk_translation +calculate_chunk_size = utils.calculate_chunk_size diff --git a/app/webui/process.py b/app/process.py similarity index 71% rename from app/webui/process.py rename to app/process.py index 2319238..9dc1b6e 100644 --- a/app/webui/process.py +++ b/app/process.py @@ -1,17 +1,30 @@ -import gradio as gr -from simplemma import simple_tokenizer from difflib import Differ -from icecream import ic -from patch import model_load,num_tokens_in_string,one_chunk_initial_translation, one_chunk_reflect_on_translation, one_chunk_improve_translation -from patch import calculate_chunk_size, multichunk_initial_translation, multichunk_reflect_on_translation, multichunk_improve_translation -import pymupdf + import docx +import gradio as gr +import pymupdf +from icecream import ic from langchain_text_splitters import RecursiveCharacterTextSplitter +from patch import calculate_chunk_size +from patch import model_load +from patch import multichunk_improve_translation +from patch import 
multichunk_initial_translation +from patch import multichunk_reflect_on_translation +from patch import num_tokens_in_string +from patch import one_chunk_improve_translation +from patch import one_chunk_initial_translation +from patch import one_chunk_reflect_on_translation +from simplemma import simple_tokenizer + + +progress = gr.Progress() -progress=gr.Progress() def extract_text(path): - return open(path, 'r').read() + with open(path) as f: + file_text = f.read() + return file_text + def extract_pdf(path): doc = pymupdf.open(path) @@ -20,29 +33,34 @@ def extract_pdf(path): text += page.get_text() return text + def extract_docx(path): doc = docx.Document(path) data = [] for paragraph in doc.paragraphs: data.append(paragraph.text) - content = '\n\n'.join(data) + content = "\n\n".join(data) return content + def tokenize(text): # Use nltk to tokenize the text words = simple_tokenizer(text) # Check if the text contains spaces - if ' ' in text: + if " " in text: # Create a list of words and spaces tokens = [] for word in words: tokens.append(word) - if not word.startswith("'") and not word.endswith("'"): # Avoid adding space after punctuation - tokens.append(' ') # Add space after each word + if not word.startswith("'") and not word.endswith( + "'" + ): # Avoid adding space after punctuation + tokens.append(" ") # Add space after each word return tokens[:-1] # Remove the last space else: return words + def diff_texts(text1, text2): tokens1 = tokenize(text1) tokens2 = tokenize(text2) @@ -54,24 +72,25 @@ def diff_texts(text1, text2): for token in diff_result: word = token[2:] category = None - if token[0] == '+': - category = 'added' - elif token[0] == '-': - category = 'removed' - elif token[0] == '?': + if token[0] == "+": + category = "added" + elif token[0] == "-": + category = "removed" + elif token[0] == "?": continue # Ignore the hints line highlighted_text.append((word, category)) return highlighted_text -#modified from src.translaation-agent.utils.tranlsate + 
+# modified from src.translation_agent.utils.translate def translator( - source_lang: str, - target_lang: str, - source_text: str, - country: str, - max_tokens:int = 1000, + source_lang: str, + target_lang: str, + source_text: str, + country: str, + max_tokens: int = 1000, ): """Translate the source_text from source_lang to target_lang.""" num_tokens_in_text = num_tokens_in_string(source_text) @@ -81,17 +100,17 @@ def translator( if num_tokens_in_text < max_tokens: ic("Translating text as single chunk") - progress((1,3), desc="First translation...") + progress((1, 3), desc="First translation...") init_translation = one_chunk_initial_translation( source_lang, target_lang, source_text ) - progress((2,3), desc="Reflecton...") + progress((2, 3), desc="Reflection...") reflection = one_chunk_reflect_on_translation( source_lang, target_lang, source_text, init_translation, country ) - progress((3,3), desc="Second translation...") + progress((3, 3), desc="Second translation...") final_translation = one_chunk_improve_translation( source_lang, target_lang, source_text, init_translation, reflection ) @@ -115,14 +134,14 @@ def translator( source_text_chunks = text_splitter.split_text(source_text) - progress((1,3), desc="First translation...") + progress((1, 3), desc="First translation...") translation_1_chunks = multichunk_initial_translation( source_lang, target_lang, source_text_chunks ) init_translation = "".join(translation_1_chunks) - progress((2,3), desc="Reflection...") + progress((2, 3), desc="Reflection...") reflection_chunks = multichunk_reflect_on_translation( source_lang, target_lang, @@ -133,7 +152,7 @@ def translator( reflection = "".join(reflection_chunks) - progress((3,3), desc="Second translation...") + progress((3, 3), desc="Second translation...") translation_2_chunks = multichunk_improve_translation( source_lang, target_lang, @@ -148,17 +167,16 @@ def translator( def translator_sec( - endpoint2: str, - base2: str, - model2: str, - api_key2: str, - 
source_lang: str, - target_lang: str, - source_text: str, - country: str, - max_tokens: int = 1000, + endpoint2: str, + base2: str, + model2: str, + api_key2: str, + source_lang: str, + target_lang: str, + source_text: str, + country: str, + max_tokens: int = 1000, ): - """Translate the source_text from source_lang to target_lang.""" num_tokens_in_text = num_tokens_in_string(source_text) @@ -167,7 +185,7 @@ def translator_sec( if num_tokens_in_text < max_tokens: ic("Translating text as single chunk") - progress((1,3), desc="First translation...") + progress((1, 3), desc="First translation...") init_translation = one_chunk_initial_translation( source_lang, target_lang, source_text ) @@ -175,14 +193,14 @@ def translator_sec( try: model_load(endpoint2, base2, model2, api_key2) except Exception as e: - raise gr.Error(f"An unexpected error occurred: {e}") + raise gr.Error(f"An unexpected error occurred: {e}") from e - progress((2,3), desc="Reflecton...") + progress((2, 3), desc="Reflection...") reflection = one_chunk_reflect_on_translation( source_lang, target_lang, source_text, init_translation, country ) - progress((3,3), desc="Second translation...") + progress((3, 3), desc="Second translation...") final_translation = one_chunk_improve_translation( source_lang, target_lang, source_text, init_translation, reflection ) @@ -206,7 +224,7 @@ def translator_sec( source_text_chunks = text_splitter.split_text(source_text) - progress((1,3), desc="First translation...") + progress((1, 3), desc="First translation...") translation_1_chunks = multichunk_initial_translation( source_lang, target_lang, source_text_chunks ) @@ -216,9 +234,9 @@ def translator_sec( try: model_load(endpoint2, base2, model2, api_key2) except Exception as e: - raise gr.Error(f"An unexpected error occurred: {e}") + raise gr.Error(f"An unexpected error occurred: {e}") from e - progress((2,3), desc="Reflection...") + progress((2, 3), desc="Reflection...") reflection_chunks = 
multichunk_reflect_on_translation( source_lang, target_lang, @@ -229,7 +247,7 @@ def translator_sec( reflection = "".join(reflection_chunks) - progress((3,3), desc="Second translation...") + progress((3, 3), desc="Second translation...") translation_2_chunks = multichunk_improve_translation( source_lang, target_lang, @@ -240,4 +258,4 @@ def translator_sec( final_translation = "".join(translation_2_chunks) - return init_translation, reflection, final_translation \ No newline at end of file + return init_translation, reflection, final_translation diff --git a/app/webui/__init__.py b/app/webui/__init__.py deleted file mode 100644 index 9cfe7b4..0000000 --- a/app/webui/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .app import * \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index db00826..097f570 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,9 @@ icecream = "^2.1.3" langchain-text-splitters = "^0.0.1" python-dotenv = "^1.0.1" +[tool.poetry.group.app] +optional = true + [tool.poetry.group.app.dependencies] simplemma = "^1.0.0" gradio = "4.37.2"