diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py
index 3fe619e8..e7a6e7c5 100644
--- a/tests/unit/test_va.py
+++ b/tests/unit/test_va.py
@@ -1,4 +1,12 @@
-from vision_agent.agent.vision_agent import parse_execution
+from vision_agent.agent.agent_utils import extract_tag
+from vision_agent.tools.meta_tools import use_extra_vision_agent_args
+
+
+def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
+ code = extract_tag(code, "execute_python")
+ if not code:
+ return None
+ return use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
def test_parse_execution_zero():
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 42541d33..24cb3851 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,4 +1,5 @@
import copy
+import json
import logging
import os
import tempfile
@@ -6,7 +7,7 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_json
+from vision_agent.agent.agent_utils import extract_json, extract_tag
from vision_agent.agent.vision_agent_prompts import (
EXAMPLES_CODE1,
EXAMPLES_CODE2,
@@ -57,6 +58,32 @@ def add_boilerplate(code: str, **format: Any) -> str:
)
+def format_agent_message(agent_message: str) -> str:
+ agent_message_json = extract_json(agent_message)
+ output = ""
+ if "thinking" in agent_message_json and agent_message_json["thinking"]:
+ output += "" + agent_message_json["thinking"] + ""
+ if "response" in agent_message_json and agent_message_json["response"]:
+ output += "" + agent_message_json["response"] + ""
+ if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
+ output += (
+ "\n\n"
+ + agent_message_json["execute_python"]
+ + "\n\n"
+ )
+ if (
+ "let_user_respond" in agent_message_json
+ and agent_message_json["let_user_respond"]
+ ):
+ output += (
+ ""
+ + str(agent_message_json["let_user_respond"])
+ + ""
+ )
+
+ return output
+
+
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
chat = copy.deepcopy(chat)
@@ -67,7 +94,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
elif chat_i["role"] == "observation":
conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
elif chat_i["role"] == "assistant":
- conversation += f"AGENT: {chat_i['content']}\n\n"
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n"
else:
raise ValueError(f"role {chat_i['role']} is not supported")
@@ -84,7 +111,19 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
and len(chat[-1]["media"]) > 0 # type: ignore
):
message["media"] = chat[-1]["media"]
- return extract_json(orch([message], stream=False)) # type: ignore
+ conv_resp = cast(str, orch([message], stream=False))
+
+ let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
+ let_user_respond = (
+ "true" in let_user_respond_str.lower() if let_user_respond_str else False
+ )
+
+ return {
+ "thinking": extract_tag(conv_resp, "thinking"),
+ "response": extract_tag(conv_resp, "response"),
+ "execute_python": extract_tag(conv_resp, "execute_python"),
+ "let_user_respond": let_user_respond,
+ }
def execute_code_action(
@@ -100,32 +139,6 @@ def execute_code_action(
return result, obs
-def parse_execution(
- response: str,
- test_multi_plan: bool = True,
- custom_tool_names: Optional[List[str]] = None,
-) -> Optional[str]:
- code = None
- remaining = response
- all_code = []
- while "" in remaining:
- code_i = remaining[
- remaining.find("") + len("") :
- ]
- code_i = code_i[: code_i.find("")]
- remaining = remaining[
- remaining.find("") + len("") :
- ]
- all_code.append(code_i)
-
- if len(all_code) > 0:
- code = "\n".join(all_code)
-
- if code is not None:
- code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
- return code
-
-
def execute_user_code_action(
last_user_message: Message,
code_interpreter: CodeInterpreter,
@@ -138,9 +151,13 @@ def execute_user_code_action(
return user_result, user_obs
last_user_content = cast(str, last_user_message.get("content", ""))
+ try:
+ user_code_action = json.loads(last_user_content).get("execute_python", None)
+ except json.JSONDecodeError:
+ return user_result, user_obs
- user_code_action = parse_execution(last_user_content, False)
if user_code_action is not None:
+ user_code_action = use_extra_vision_agent_args(user_code_action, False)
user_result, user_obs = execute_code_action(
user_code_action, code_interpreter, artifact_remote_path
)
@@ -149,29 +166,28 @@ def execute_user_code_action(
return user_result, user_obs
-def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
+def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
response = copy.deepcopy(response)
- if "response" in response:
- resp_str = response["response"]
- if "" in resp_str:
- # only include descriptions for these, the rest will just have executing
- # code
- description_map = {
- "open_code_artifact": "Reading file.",
- "create_code_artifact": "Creating file.",
- "edit_code_artifact": "Editing file.",
- "generate_vision_code": "Generating vision code.",
- "edit_vision_code": "Editing vision code.",
- }
- description = ""
- for k, v in description_map.items():
- if k in resp_str:
- description += v + " "
- if description == "":
- description = "Executing code."
- resp_str = resp_str[resp_str.find("") :]
- resp_str = description + resp_str
- response["response"] = resp_str
+
+ if "execute_python" in response and response["execute_python"]:
+ # only include descriptions for these, the rest will just have executing
+ # code
+ description_map = {
+ "open_code_artifact": "Reading file.",
+ "create_code_artifact": "Creating file.",
+ "edit_code_artifact": "Editing file.",
+ "generate_vision_code": "Generating vision code.",
+ "edit_vision_code": "Editing vision code.",
+ }
+ description = ""
+ for k, v in description_map.items():
+ if k in response["execute_python"]:
+ description += v + " "
+ if description == "":
+ description = "Executing code."
+
+ response["response"] = description
+
return response
@@ -394,13 +410,13 @@ def chat_with_artifacts(
int_chat.append(
{
"role": "assistant",
- "content": str(add_step_descriptions(response)),
+ "content": json.dumps(add_step_descriptions(response)),
}
)
orig_chat.append(
{
"role": "assistant",
- "content": str(add_step_descriptions(response)),
+ "content": json.dumps(add_step_descriptions(response)),
}
)
@@ -408,11 +424,13 @@ def chat_with_artifacts(
if last_response == response:
response["let_user_respond"] = True
- finished = response["let_user_respond"]
+ finished = response.get("let_user_respond", False)
- code_action = parse_execution(
- response["response"], test_multi_plan, custom_tool_names
- )
+ code_action = response.get("execute_python", None)
+ if code_action is not None:
+ code_action = use_extra_vision_agent_args(
+ code_action, test_multi_plan, custom_tool_names
+ )
if last_response == response:
self.streaming_message(
@@ -431,7 +449,7 @@ def chat_with_artifacts(
self.streaming_message(
{
"role": "assistant",
- "content": response,
+ "content": json.dumps(response),
"finished": finished and code_action is None,
}
)
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index a8b1e543..93343045 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -27,13 +27,14 @@
**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
-3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set to `true`.
4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
-5. **Output in JSON**: Respond in the following format in JSON:
+5. **Output**: You can only respond with , , , and tags.
-```json
-{{"thoughts": , "response": , "let_user_respond": }}.
-```
+Your thoughts here...
+Your response to the user here...
+Your code here...
+true/false for whether or not to you want to let the user respond.
"""
@@ -45,7 +46,11 @@
Artifact dog.jpg loaded to /path/to/images/dog.jpg
[End of artifacts]
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false}
+AGENT: I will use the generate_vision_code to detect the dogs in the image.
+
+generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
+
+false
OBSERVATION:
[Artifact dog_detector.py]
@@ -56,14 +61,21 @@
4| return dogs
[End of artifact]
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
+AGENT: I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.
+
+from dog_detector import detect_dogs
+print(detect_dogs('/example/workspace/dog.jpg'))
+
+false
OBSERVATION:
----- stdout -----
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
+AGENT: Two dogs are detected, I will show this to the user and ask them if the result looks good.
+The code detectd two dogs, do the results look good to you?
+true
"""
EXAMPLES_CODE1_EXTRA = """
@@ -75,7 +87,11 @@
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
[End of artifacts]
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])", "let_user_respond": false}
+AGENT: I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that
+
+edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
+
+false
OBSERVATION:
[Artifact dog_detector.py]
@@ -86,13 +102,20 @@
4| return dogs
[End of artifact]
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false}
+AGENT: I have edited the code to detect only one dog, I must now run the code and print the results to get the output.
+
+from dog_detector import detect_dogs
+print(detect_dogs('/path/to/images/dog.jpg'))
+
+false
OBSERVATION:
----- stdout -----
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
+AGENT: One dog is detected, I will show this to the user and ask them if the result looks good.
+The code detected one dog, do these results look good to you?
+true
"""
EXAMPLES_CODE2 = """
@@ -103,12 +126,18 @@
Artifact image.jpg loaded to /path/to/images/image.jpg
[End of artifacts]
-AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifacts('image.jpg')", "let_user_respond": false}
+AGENT: The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.
+
+view_media_artifacts('image.jpg')
+
+false
OBSERVATION:
[Image image.jpg displayed]
-AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
+AGENT: The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.
+The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?
+true
"""
@@ -119,7 +148,9 @@
[Artifacts loaded]
[End of artifacts]
-AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+AGENT: The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.
+Can you provide an image of workers with helmets?
+true
USER: Yes you can use workers.png
@@ -128,7 +159,11 @@
Artifact workers.png loaded to /path/to/images/workers.png
[End of artifacts]
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])", "let_user_respond": false}
+AGENT: I will use the generate_vision_code to count the workers with helmets in the image.
+
+generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
+
+false
OBSERVATION:
[Artifact code.py]
@@ -152,22 +187,37 @@
16| return count
[End of artifact]
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false}
+AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.
+
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')
+
+false
OBSERVATION:
----- stdout -----
2
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.
+The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.
+true
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false}
+AGENT: Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model
+
+object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
+
+false
OBSERVATION:
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false}
+AGENT: The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.
+
+use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
+
+false
OBSERVATION:
[Artifact code.py edits]
@@ -184,11 +234,19 @@ def count_workers_with_helmets(image_path: str, output_path: str):
count = 0
[End of edits]
-AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false}
+AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.
+
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')
+
+false
OBSERVATION:
----- stdout -----
3
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.
+I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.
+true
"""