From 94533dbed6290a32a4bbe3114df51dad4d16b613 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 5 Sep 2024 11:36:01 -0700
Subject: [PATCH] add support for passing args to visionagentcoder

---
 vision_agent/agent/vision_agent.py | 23 ++++++++--
 vision_agent/tools/meta_tools.py   | 71 ++++++++++++++++++++++++++++--
 2 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 4733bb24..776ab964 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -14,7 +14,7 @@
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts
+from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
 
@@ -87,11 +87,18 @@ def run_code_action(
     return result, obs
 
 
-def parse_execution(response: str) -> Optional[str]:
+def parse_execution(
+    response: str,
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
+) -> Optional[str]:
     code = None
     if "<execute_python>" in response:
         code = response[response.find("<execute_python>") + len("<execute_python>") :]
         code = code[: code.find("</execute_python>")]
+
+    if code is not None:  # force the user-supplied args into the tool calls
+        code = use_extra_vision_agent_args(code, test_multi_plan, customized_tool_names)
     return code
 
 
@@ -174,6 +181,8 @@ def chat_with_code(
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
+        test_multi_plan: bool = True,
+        customized_tool_names: Optional[List[str]] = None,
     ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
@@ -184,6 +193,12 @@ def chat_with_code(
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             artifacts (Optional[Artifacts]): The artifacts to use in the task.
+            test_multi_plan (bool): If True, it will test tools for multiple plans and
+                pick the best one based off of the tool results. If False, it will go
+                with the first plan.
+            customized_tool_names (Optional[List[str]]): A list of customized tools
+                for the agent to pick and use. If not provided, defaults to the full
+                tool set from vision_agent.tools.
 
         Returns:
             List[Message]: The conversation response.
@@ -262,7 +277,9 @@ def chat_with_code(
                 if response["let_user_respond"]:
                     break
 
-                code_action = parse_execution(response["response"])
+                code_action = parse_execution(
+                    response["response"], test_multi_plan, customized_tool_names
+                )
 
                 if code_action is not None:
                     result, obs = run_code_action(
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index af28360b..0847dcae 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -297,7 +297,12 @@ def edit_code_artifact(
 
 
 def generate_vision_code(
-    artifacts: Artifacts, name: str, chat: str, media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat: str,
+    media: List[str],
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Generates python code to solve vision based tasks.
 
@@ -306,6 +311,8 @@ def generate_vision_code(
         name (str): The name of the artifact to save the code to.
         chat (str): The chat message from the user.
         media (List[str]): The media files to use.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The generated code.
@@ -330,7 +337,11 @@ def detect_dogs(image_path: str):
         agent = va.agent.VisionAgentCoder()
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=False)
+    response = agent.chat_with_workflow(
+        fixed_chat,
+        test_multi_plan=test_multi_plan,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -342,7 +353,11 @@ def detect_dogs(image_path: str):
 
 
 def edit_vision_code(
-    artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat_history: List[str],
+    media: List[str],
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Edits python code to solve a vision based task.
 
@@ -350,6 +365,7 @@ def edit_vision_code(
         artifacts (Artifacts): The artifacts object to save the code to.
         name (str): The file path to the code.
         chat_history (List[str]): The chat history to used to generate the code.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The edited code.
@@ -386,7 +402,11 @@ def detect_dogs(image_path: str):
             fixed_chat_history.append({"role": "assistant", "content": code})
             fixed_chat_history.append({"role": "user", "content": chat})
 
-    response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+    response = agent.chat_with_workflow(
+        fixed_chat_history,
+        test_multi_plan=False,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -480,6 +500,49 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
     return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
 
 
+def use_extra_vision_agent_args(
+    code: str,
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
+) -> str:
+    """Force the user's VisionAgent arguments into the VisionAgentCoder tool calls.
+
+    Parameters:
+        code (str): The code to edit.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
+
+    Returns:
+        str: The edited code.
+    """
+    token = re.compile(r"""("(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*')|([(\[{])|([)\]}])|,""")
+    forced = re.compile(r"(test_multi_plan|customized_tool_names)\s*=")
+    out, pos = "", 0
+    for m in re.finditer(r"(generate|edit)_vision_code\(", code):
+        if m.start() < pos:  # nested inside a call that was already rewritten
+            continue
+        # Tokenize up to the matching ")": "[^)]+" stops at a ")" inside the chat.
+        args: List[str] = []
+        start, depth = m.end(), 1
+        for t in token.finditer(code, m.end()):
+            depth += bool(t.group(2)) - bool(t.group(3))
+            if not depth or (t.group() == "," and depth == 1):
+                args.append(code[start : t.start()].strip())
+                start = t.end()
+            if not depth:
+                break
+        if depth:  # unbalanced call: leave the rest of the code untouched
+            break
+        args = [a for a in args if a and not forced.match(a)]  # no "" / stale kwargs
+        if m.group(1) == "generate":
+            args.append(f"test_multi_plan={test_multi_plan}")
+        if customized_tool_names is not None:
+            args.append(f"customized_tool_names={customized_tool_names}")
+        out += f"{code[pos:m.start()]}{m.group(1)}_vision_code({', '.join(args)})"
+        pos = start
+    return out + code[pos:]
+
+
 def use_florence2_fine_tuning(
     artifacts: Artifacts, name: str, task: str, fine_tune_id: str
 ) -> str: