diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 6714ba34..5b8116ae 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -10,11 +10,16 @@
 from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
+    EXAMPLES_CODE3,
     VA_CODE,
 )
 from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING, load_image, save_image
-from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
+from vision_agent.tools.meta_tools import (
+    Artifacts,
+    check_and_load_image,
+    use_extra_vision_agent_args,
+)
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
 
@@ -30,7 +35,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -68,10 +73,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
     prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
-        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
+        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}",
         conversation=conversation,
     )
-    return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
+    message: Message = {"role": "user", "content": prompt}
+    if (
+        chat[-1]["role"] == "observation"
+        and "media" in chat[-1]
+        and len(chat[-1]["media"]) > 0  # type: ignore
+    ):
+        message["media"] = chat[-1]["media"]
+    return extract_json(orch([message], stream=False))  # type: ignore
 
 
 def run_code_action(
@@ -296,13 +308,21 @@ def chat_with_code(
             code_action, code_interpreter, str(remote_artifacts_path)
         )
 
+        media_obs = check_and_load_image(code_action)
+
         if self.verbosity >= 1:
             _LOGGER.info(obs)
+
+        chat_elt: Message = {"role": "observation", "content": obs}
+        if media_obs and result.success:
+            chat_elt["media"] = [
+                Path(code_interpreter.remote_path) / media_ob
+                for media_ob in media_obs
+            ]
+
         # don't add execution results to internal chat
-        int_chat.append({"role": "observation", "content": obs})
-        orig_chat.append(
-            {"role": "observation", "content": obs, "execution": result}
-        )
+        int_chat.append(chat_elt)
+        orig_chat.append({**chat_elt, "execution": result})
         self.streaming_message(
             {
                 "role": "observation",
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 7b0dd600..557283a7 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -18,19 +18,24 @@
 {examples}
 --- END EXAMPLES ---
 
-**Instructions**:
-1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Output in JSON**: Respond in JSON format, {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
-
 **Conversation**: Here is the current conversation so far:
 --- START CONVERSATION ---
 {conversation}
+--- END CONVERSATION ---
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
+2. **Output in JSON**: Respond in the following format in JSON:
+
+```json
+{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
+```
 """
 
 EXAMPLES_CODE1 = """
-USER: Can you detect the dogs in this image? Media name dog.jpg
+USER: Can you write code to detect the dogs in this image? Media name dog.jpg
 
 OBSERVATION:
 [Artifacts loaded]
@@ -61,6 +66,7 @@
 EXAMPLES_CODE1_EXTRA = """
 USER: The image only has one dog, can you fix this?
 
+OBSERVATION:
 [Artifacts loaded]
 Artifact dog.jpg loaded to /path/to/images/dog.jpg
 Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
@@ -86,8 +92,24 @@
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """
 
-
 EXAMPLES_CODE2 = """
+USER: Can you describe this image?
+
+OBSERVATION:
+[Artifacts loaded]
+Artifact image.jpg loaded to /path/to/images/image.jpg
+[End of artifacts]
+
+AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple, so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifact(artifacts, 'image.jpg')", "let_user_respond": false}
+
+OBSERVATION:
+[Image image.jpg displayed]
+
+AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
+"""
+
+
+EXAMPLES_CODE3 = """
 USER: Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?
 
 OBSERVATION:
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index ccf98287..3ec227f8 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -437,6 +437,33 @@ def list_artifacts(artifacts: Artifacts) -> str:
     return output_str
 
 
+def check_and_load_image(code: str) -> List[str]:
+    if not code.strip():
+        return []
+
+    pattern = r"view_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)"
+    match = re.search(pattern, code)
+    if match:
+        name = match.group(2)
+        return [name]
+    return []
+
+
+def view_media_artifact(artifacts: Artifacts, name: str) -> str:
+    """Views the image artifact with the given name.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to show the image from.
+        name (str): The name of the image artifact to show.
+    """
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+    else:
+        output_str = f"[Image {name} displayed]"
+    print(output_str)
+    return output_str
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
@@ -515,7 +542,7 @@ def use_extra_vision_agent_args(
     Returns:
         str: The edited code.
""" - generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)" + generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)" def generate_replacer(match: re.Match) -> str: arg = match.group(1) @@ -526,7 +553,7 @@ def generate_replacer(match: re.Match) -> str: out_str += ")" return out_str - edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)" + edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)" def edit_replacer(match: re.Match) -> str: arg = match.group(1) @@ -604,6 +631,7 @@ def replacer(match: re.Match) -> str: generate_vision_code, edit_vision_code, write_media_artifact, + view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning, list_artifacts,