From a7dd110c60c8b0dd0a8fa6d8cf77eb618a7d106a Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 9 Sep 2024 08:21:25 -0700
Subject: [PATCH] Improve Orchestrator (#222)

* updating prompts with fine tuning examples

* update docs and add get_diff_with_prompt

* use get_diff_with_prompts for editing code

* add extract frames to util docs

* add prompts around edits

* add support for passing args to visionagentcoder

* for debugging

* format changes

* removed duplicate TOOLS_INFO
---
 vision_agent/agent/vision_agent.py         | 23 +++++-
 vision_agent/agent/vision_agent_prompts.py | 63 +++++++++++----
 vision_agent/tools/meta_tools.py           | 89 +++++++++++++++++++---
 vision_agent/tools/tools.py                | 13 +---
 4 files changed, 149 insertions(+), 39 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 4733bb24..776ab964 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -14,7 +14,7 @@
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts
+from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
 
@@ -87,11 +87,18 @@ def run_code_action(
     return result, obs
 
 
-def parse_execution(response: str) -> Optional[str]:
+def parse_execution(
+    response: str,
+    test_multi_plan: bool = True,
+    customed_tool_names: Optional[List[str]] = None,
+) -> Optional[str]:
     code = None
     if "<execute_python>" in response:
         code = response[response.find("<execute_python>") + len("<execute_python>") :]
         code = code[: code.find("</execute_python>")]
+
+    if code is not None:
+        code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
     return code
 
 
@@ -174,6 +181,8 @@ def chat_with_code(
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
+        test_multi_plan: bool = True,
+        customized_tool_names: Optional[List[str]] = None,
     ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
@@ -184,6 +193,12 @@ def chat_with_code(
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             artifacts (Optional[Artifacts]): The artifacts to use in the task.
+            test_multi_plan (bool): If True, it will test tools for multiple plans and
+                pick the best one based off of the tool results. If False, it will go
+                with the first plan.
+            customized_tool_names (List[str]): A list of customized tools for agent to
+                pick and use. If not provided, default to full tool set from
+                vision_agent.tools.
 
         Returns:
             List[Message]: The conversation response.
@@ -262,7 +277,9 @@ def chat_with_code(
                 if response["let_user_respond"]:
                     break
 
-                code_action = parse_execution(response["response"])
+                code_action = parse_execution(
+                    response["response"], test_multi_plan, customized_tool_names
+                )
 
                 if code_action is not None:
                     result, obs = run_code_action(
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index bf9fac80..7b0dd600 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -1,5 +1,5 @@
 VA_CODE = """
-**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
+**Role**: You are a helpful agent that assists users with writing code.
 
 **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
 
@@ -56,7 +56,9 @@
 
 
 AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+"""
 
+EXAMPLES_CODE1_EXTRA = """
 USER: The the image only has one dog, can you fix this?
 
 [Artifacts loaded]
@@ -105,25 +107,24 @@
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = owl_v2("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
 6|    for worker in workers:
-7|        person_box = worker['bbox']
-8|        person_has_helmet = False
-9|        for helmet in helmets:
-10|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
-11|                person_has_helmet = True
-12|                break
-13|        if person_has_helmet:
-14|            count += 1
-15|    overlay_bounding_boxes(image, detections)
-16|    save_image(output_path, image)
-15|    return count
+7|        person_has_helmet = False
+8|        for helmet in helmets:
+9|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
+10|                person_has_helmet = True
+11|                break
+12|        if person_has_helmet:
+13|            count += 1
+14|    overlay_bounding_boxes(image, detections)
+15|    save_image(output_path, image)
+16|    return count
 [End of artifact]
 
 AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
@@ -133,4 +134,38 @@
 2
 
 AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+
+USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
+
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>florence2_fine_tuning([{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], 'phrase_grounding')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
+
+
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_florence2_fine_tuning(artifacts, 'code.py', 'phrase_grounding', '23b3b022-5ebf-4798-9373-20ef36429abf')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Artifact code.py edits]
+---
++++
+@@ -1,7 +1,7 @@
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ def count_workers_with_helmets(image_path: str, output_path: str):
+     image = load_image(image_path)
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+     workers = [d for d in detections if d['label'] == 'worker']
+     helmets = [d for d in detections if d['label'] == 'helmet']
+     count = 0
+[End of edits]
+
+
+AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visualization.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+----- stdout -----
+3
+
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 3670e600..ccf98287 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -297,7 +297,12 @@ def edit_code_artifact(
 
 
 def generate_vision_code(
-    artifacts: Artifacts, name: str, chat: str, media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat: str,
+    media: List[str],
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Generates python code to solve vision based tasks.
 
@@ -306,6 +311,8 @@ def generate_vision_code(
         name (str): The name of the artifact to save the code to.
         chat (str): The chat message from the user.
         media (List[str]): The media files to use.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The generated code.
@@ -330,7 +337,11 @@ def detect_dogs(image_path: str):
         agent = va.agent.VisionAgentCoder()
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True)
+    response = agent.chat_with_workflow(
+        fixed_chat,
+        test_multi_plan=test_multi_plan,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -342,7 +353,11 @@ def detect_dogs(image_path: str):
 
 
 def edit_vision_code(
-    artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat_history: List[str],
+    media: List[str],
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Edits python code to solve a vision based task.
 
@@ -350,6 +365,7 @@ def edit_vision_code(
         artifacts (Artifacts): The artifacts object to save the code to.
         name (str): The file path to the code.
         chat_history (List[str]): The chat history to used to generate the code.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
     Returns:
         str: The edited code.
@@ -386,7 +402,11 @@ def detect_dogs(image_path: str):
             fixed_chat_history.append({"role": "assistant", "content": code})
             fixed_chat_history.append({"role": "user", "content": chat})
 
-    response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+    response = agent.chat_with_workflow(
+        fixed_chat_history,
+        test_multi_plan=False,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -425,18 +445,19 @@ def get_tool_descriptions() -> str:
 
 
 def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
-    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    """DO NOT use this function unless the user has supplied you with bboxes.
+    'florence2_fine_tuning' is a tool that fine-tunes florence2 to be able to detect
     objects in an image based on a given dataset. It returns the fine tuning job id.
 
     Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
+        bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
+            and bounding boxes. The coordinates are unnormalized.
         task (str): The florencev2 fine-tuning task. The options are
             'phrase_grounding'.
 
     Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
+        str: The fine tuning job id, this id will be used to retrieve the fine tuned
+            model.
 
     Example
     -------
@@ -473,6 +494,54 @@ def get_diff(before: str, after: str) -> str:
     )
 
 
+def get_diff_with_prompts(name: str, before: str, after: str) -> str:
+    diff = get_diff(before, after)
+    return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
+
+
+def use_extra_vision_agent_args(
+    code: str,
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
+) -> str:
+    """This is for forcing arguments passed by the user to VisionAgent into the
+    VisionAgentCoder call.
+
+    Parameters:
+        code (str): The code to edit.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
+
+    Returns:
+        str: The edited code.
+    """
+    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
+
+    def generate_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"generate_vision_code({arg}, test_multi_plan={test_multi_plan}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
+
+    def edit_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"edit_vision_code({arg}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    new_code = re.sub(generate_pattern, generate_replacer, code)
+    new_code = re.sub(edit_pattern, edit_replacer, new_code)
+    return new_code
+
+
 def use_florence2_fine_tuning(
     artifacts: Artifacts, name: str, task: str, fine_tune_id: str
 ) -> str:
@@ -521,7 +590,7 @@ def replacer(match: re.Match) -> str:
 
     artifacts[name] = new_code
 
-    diff = get_diff(code, new_code)
+    diff = get_diff_with_prompts(name, code, new_code)
     print(diff)
     return diff
 
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index e8e23ba6..7092a646 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1945,15 +1945,4 @@ def overlay_counting_results(
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 TOOLS_INFO = get_tools_info(FUNCTION_TOOLS)  # type: ignore
-UTILITIES_DOCSTRING = get_tool_documentation(
-    [
-        save_json,
-        load_image,
-        save_image,
-        save_video,
-        overlay_bounding_boxes,
-        overlay_segmentation_masks,
-        overlay_heat_map,
-        overlay_counting_results,
-    ]
-)
+UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS)  # type: ignore