diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py index 3fe619e8..e7a6e7c5 100644 --- a/tests/unit/test_va.py +++ b/tests/unit/test_va.py @@ -1,4 +1,12 @@ -from vision_agent.agent.vision_agent import parse_execution +from vision_agent.agent.agent_utils import extract_tag +from vision_agent.tools.meta_tools import use_extra_vision_agent_args + + +def parse_execution(code, test_multi_plan=True, custom_tool_names=None): + code = extract_tag(code, "execute_python") + if not code: + return None + return use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names) def test_parse_execution_zero(): diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 42541d33..24cb3851 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,4 +1,5 @@ import copy +import json import logging import os import tempfile @@ -6,7 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast from vision_agent.agent import Agent -from vision_agent.agent.agent_utils import extract_json +from vision_agent.agent.agent_utils import extract_json, extract_tag from vision_agent.agent.vision_agent_prompts import ( EXAMPLES_CODE1, EXAMPLES_CODE2, @@ -57,6 +58,32 @@ def add_boilerplate(code: str, **format: Any) -> str: ) +def format_agent_message(agent_message: str) -> str: + agent_message_json = extract_json(agent_message) + output = "" + if "thinking" in agent_message_json and agent_message_json["thinking"]: + output += "" + agent_message_json["thinking"] + "" + if "response" in agent_message_json and agent_message_json["response"]: + output += "" + agent_message_json["response"] + "" + if "execute_python" in agent_message_json and agent_message_json["execute_python"]: + output += ( + "\n\n" + + agent_message_json["execute_python"] + + "\n\n" + ) + if ( + "let_user_respond" in agent_message_json + and agent_message_json["let_user_respond"] + ): + output += ( + "" + + str(agent_message_json["let_user_respond"]) + + "" + ) + + return output + + def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: chat = copy.deepcopy(chat) @@ -67,7 +94,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: elif chat_i["role"] == "observation": conversation += f"OBSERVATION:\n{chat_i['content']}\n\n" elif chat_i["role"] == "assistant": - conversation += f"AGENT: {chat_i['content']}\n\n" + conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" else: raise ValueError(f"role {chat_i['role']} is not supported") @@ -84,7 +111,19 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: and len(chat[-1]["media"]) > 0 # type: ignore ): message["media"] = chat[-1]["media"] - return extract_json(orch([message], stream=False)) # type: ignore + conv_resp = cast(str, orch([message], stream=False)) + + let_user_respond_str = extract_tag(conv_resp, "let_user_respond") + let_user_respond = ( + "true" in let_user_respond_str.lower() if let_user_respond_str else False + ) + + return { + "thinking": extract_tag(conv_resp, "thinking"), + "response": extract_tag(conv_resp, "response"), + "execute_python": extract_tag(conv_resp, "execute_python"), + "let_user_respond": let_user_respond, + } def execute_code_action( @@ -100,32 +139,6 @@ def execute_code_action( return result, obs -def parse_execution( - response: str, - test_multi_plan: bool = True, - custom_tool_names: Optional[List[str]] = None, -) -> Optional[str]: - code = None - remaining = response - all_code = [] - while "" in remaining: - code_i = remaining[ - remaining.find("") + len("") : - ] - code_i = code_i[: code_i.find("")] - remaining = remaining[ - remaining.find("") + len("") : - ] - all_code.append(code_i) - - if len(all_code) > 0: - code = "\n".join(all_code) - - if code is not None: - code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names) - return code - - def execute_user_code_action( last_user_message: Message, code_interpreter: CodeInterpreter, @@ -138,9 +151,13 @@ def execute_user_code_action( return user_result, user_obs last_user_content = cast(str, last_user_message.get("content", "")) + try: + user_code_action = json.loads(last_user_content).get("execute_python", None) + except json.JSONDecodeError: + return user_result, user_obs - user_code_action = parse_execution(last_user_content, False) if user_code_action is not None: + user_code_action = use_extra_vision_agent_args(user_code_action, False) user_result, user_obs = execute_code_action( user_code_action, code_interpreter, artifact_remote_path ) @@ -149,29 +166,28 @@ def execute_user_code_action( return user_result, user_obs -def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]: +def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]: response = copy.deepcopy(response) - if "response" in response: - resp_str = response["response"] - if "" in resp_str: - # only include descriptions for these, the rest will just have executing - # code - description_map = { - "open_code_artifact": "Reading file.", - "create_code_artifact": "Creating file.", - "edit_code_artifact": "Editing file.", - "generate_vision_code": "Generating vision code.", - "edit_vision_code": "Editing vision code.", - } - description = "" - for k, v in description_map.items(): - if k in resp_str: - description += v + " " - if description == "": - description = "Executing code." - resp_str = resp_str[resp_str.find("") :] - resp_str = description + resp_str - response["response"] = resp_str + + if "execute_python" in response and response["execute_python"]: + # only include descriptions for these, the rest will just have executing + # code + description_map = { + "open_code_artifact": "Reading file.", + "create_code_artifact": "Creating file.", + "edit_code_artifact": "Editing file.", + "generate_vision_code": "Generating vision code.", + "edit_vision_code": "Editing vision code.", + } + description = "" + for k, v in description_map.items(): + if k in response["execute_python"]: + description += v + " " + if description == "": + description = "Executing code." + + response["response"] = description + return response @@ -394,13 +410,13 @@ def chat_with_artifacts( int_chat.append( { "role": "assistant", - "content": str(add_step_descriptions(response)), + "content": json.dumps(add_step_descriptions(response)), } ) orig_chat.append( { "role": "assistant", - "content": str(add_step_descriptions(response)), + "content": json.dumps(add_step_descriptions(response)), } ) @@ -408,11 +424,13 @@ def chat_with_artifacts( if last_response == response: response["let_user_respond"] = True - finished = response["let_user_respond"] + finished = response.get("let_user_respond", False) - code_action = parse_execution( - response["response"], test_multi_plan, custom_tool_names - ) + code_action = response.get("execute_python", None) + if code_action is not None: + code_action = use_extra_vision_agent_args( + code_action, test_multi_plan, custom_tool_names + ) if last_response == response: self.streaming_message( @@ -431,7 +449,7 @@ def chat_with_artifacts( self.streaming_message( { "role": "assistant", - "content": response, + "content": json.dumps(response), "finished": finished and code_action is None, } ) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index a8b1e543..93343045 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -27,13 +27,14 @@ **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. 2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. -3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`. +3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set to `true`. 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. -5. **Output in JSON**: Respond in the following format in JSON: +5. **Output**: You can only respond with , , , and tags. -```json -{{"thoughts": , "response": , "let_user_respond": }}. -``` +Your thoughts here... +Your response to the user here... +Your code here... +true/false for whether or not to you want to let the user respond. """ @@ -45,7 +46,11 @@ Artifact dog.jpg loaded to /path/to/images/dog.jpg [End of artifacts] -AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false} +AGENT: I will use the generate_vision_code to detect the dogs in the image. + +generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg']) + +false OBSERVATION: [Artifact dog_detector.py] @@ -56,14 +61,21 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} +AGENT: I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output. + +from dog_detector import detect_dogs +print(detect_dogs('/example/workspace/dog.jpg')) + +false OBSERVATION: ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}] -AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true} +AGENT: Two dogs are detected, I will show this to the user and ask them if the result looks good. +The code detectd two dogs, do the results look good to you? +true """ EXAMPLES_CODE1_EXTRA = """ @@ -75,7 +87,11 @@ Artifact dog_detector.py loaded to /path/to/code/dog_detector.py [End of artifacts] -AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])", "let_user_respond": false} +AGENT: I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that + +edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg']) + +false OBSERVATION: [Artifact dog_detector.py] @@ -86,13 +102,20 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} +AGENT: I have edited the code to detect only one dog, I must now run the code and print the results to get the output. + +from dog_detector import detect_dogs +print(detect_dogs('/path/to/images/dog.jpg')) + +false OBSERVATION: ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}] -AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true} +AGENT: One dog is detected, I will show this to the user and ask them if the result looks good. +The code detected one dog, do these results look good to you? +true """ EXAMPLES_CODE2 = """ @@ -103,12 +126,18 @@ Artifact image.jpg loaded to /path/to/images/image.jpg [End of artifacts] -AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifacts('image.jpg')", "let_user_respond": false} +AGENT: The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly. + +view_media_artifacts('image.jpg') + +false OBSERVATION: [Image image.jpg displayed] -AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true} +AGENT: The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance. +The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks? +true """ @@ -119,7 +148,9 @@ [Artifacts loaded] [End of artifacts] -AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true} +AGENT: The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets. +Can you provide an image of workers with helmets? +true USER: Yes you can use workers.png @@ -128,7 +159,11 @@ Artifact workers.png loaded to /path/to/images/workers.png [End of artifacts] -AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])", "let_user_respond": false} +AGENT: I will use the generate_vision_code to count the workers with helmets in the image. + +generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png']) + +false OBSERVATION: [Artifact code.py] @@ -152,22 +187,37 @@ 16| return count [End of artifact] -AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false} +AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization. + +from code import count_workers_with_helmets +print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png') + +false OBSERVATION: ----- stdout ----- 2 -AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: Two workers with helmets are detected, I will show this to the user and ask them if the result looks good. +The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'. +true USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} +AGENT: Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model + +object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]) + +false OBSERVATION: [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id. + +use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf") + +false OBSERVATION: [Artifact code.py edits] @@ -184,11 +234,19 @@ def count_workers_with_helmets(image_path: str, output_path: str): count = 0 [End of edits] -AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false} +AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization. + +from code import count_workers_with_helmets +print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png')) +write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png') + +false OBSERVATION: ----- stdout ----- 3 -AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: Three workers with helmets are detected, I will show this to the user and ask them if the result looks good. +I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'. +true """