From cfe42011701e47b71e31e4f514789055f6e98eb3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 08:34:26 -0700 Subject: [PATCH 01/26] add more checks around changing new/old format --- vision_agent/agent/vision_agent.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 39f79e8a..679a9d3c 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -231,9 +231,18 @@ def old_format_to_new_format(old_format_str: str) -> str: except json.JSONDecodeError: return old_format_str - thinking = old_format["thoughts"] if old_format["thoughts"].strip() != "" else None - let_user_respond = old_format["let_user_respond"] - if "" in old_format["response"]: + if "thoughts" in old_format: + thinking = ( + old_format["thoughts"] if old_format["thoughts"].strip() != "" else None + ) + else: + thinking = None + + let_user_respond = ( + old_format["let_user_respond"] if "let_user_respond" in old_format else True + ) + + if "response" in old_format and "" in old_format["response"]: execute_python = extract_tag(old_format["response"], "execute_python") response = ( old_format["response"] @@ -244,7 +253,7 @@ def old_format_to_new_format(old_format_str: str) -> str: ) else: execute_python = None - response = old_format["response"] + response = old_format["response"] if "response" in old_format else None return json.dumps( { From 79151bb1e46ffb0b132ce1d29a8a4aa51c3ee44f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 14:37:58 -0700 Subject: [PATCH 02/26] fix issue with raising json error type --- vision_agent/agent/agent_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index cb7e1b44..cfcca382 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -72,7 +72,9 @@ def extract_json(json_str: str) -> Dict[str, Any]: if json_dict is None: error_msg = f"Could not extract JSON from the given str: {json_orig}" _LOGGER.exception(error_msg) - raise ValueError(error_msg) + raise json.JSONDecodeError( + msg="Could not extract JSON", doc=json_orig, pos=0 + ) return json_dict From 9b287068b1c6983197c07e866864f957d7757b89 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 14:38:28 -0700 Subject: [PATCH 03/26] only add artifacts to media list if they exist --- vision_agent/agent/vision_agent.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 679a9d3c..8e5eb2d1 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -153,7 +153,7 @@ def execute_code_action( obs = str(result.logs) if result.error: obs += f"\n{result.error}" - extract_and_save_files_to_artifacts(artifacts, code, obs) + extract_and_save_files_to_artifacts(artifacts, code, obs, result) return result, obs @@ -562,10 +562,16 @@ def chat_with_artifacts( self.local_artifacts_path, Path(self.local_artifacts_path).parent, ) - obs_chat_elt["media"] = [ - Path(self.local_artifacts_path).parent / media_ob - for media_ob in media_obs - ] + + # check if the media is actually in the artifacts + media_obs_chat = [] + for media_ob in media_obs: + if media_ob not in artifacts.artifacts: + media_obs_chat.append( + Path(self.local_artifacts_path).parent / media_ob + ) + if media_obs_chat: + obs_chat_elt["media"] = 
media_obs_chat # don't add execution results to internal chat int_chat.append(obs_chat_elt) From 9405de00f685e3d0efb0f9ba9efb3b31c07f5b8f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 14:39:19 -0700 Subject: [PATCH 04/26] only add artifacts to media list if they exist --- vision_agent/agent/vision_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 8e5eb2d1..24f8cc33 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -566,7 +566,7 @@ def chat_with_artifacts( # check if the media is actually in the artifacts media_obs_chat = [] for media_ob in media_obs: - if media_ob not in artifacts.artifacts: + if media_ob in artifacts.artifacts: media_obs_chat.append( Path(self.local_artifacts_path).parent / media_ob ) From 398ff232aa5981d5d6a0274c84c70feba05bb0f1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 14:39:51 -0700 Subject: [PATCH 05/26] only add artifacts to media list if they exist --- vision_agent/agent/vision_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 24f8cc33..48cd6590 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -570,7 +570,7 @@ def chat_with_artifacts( media_obs_chat.append( Path(self.local_artifacts_path).parent / media_ob ) - if media_obs_chat: + if len(media_obs_chat) > 0: obs_chat_elt["media"] = media_obs_chat # don't add execution results to internal chat From d0116176f86da04c9890c574675b1a7b12ac88c8 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 14:40:06 -0700 Subject: [PATCH 06/26] extract media from ipython display --- vision_agent/tools/meta_tools.py | 107 ++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 37 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index b230e390..f166eba2 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,3 +1,4 @@ +import base64 import difflib import json import os @@ -5,6 +6,7 @@ import re import subprocess import tempfile +import uuid from pathlib import Path from typing import Any, Dict, List, Optional, Union @@ -570,8 +572,9 @@ def check_and_load_image(code: str) -> List[str]: def view_media_artifact(artifacts: Artifacts, name: str) -> str: - """Allows you to view the media artifact with the given name. This does not show - the media to the user, the user can already see all media saved in the artifacts. + """Allows only the agent to view the media artifact with the given name. DO NOT use + this to show media to the user, the user can already see all media saved in the + artifacts. Parameters: artifacts (Artifacts): The artifacts object to show the image from. @@ -758,7 +761,7 @@ def use_object_detection_fine_tuning( def extract_and_save_files_to_artifacts( - artifacts: Artifacts, code: str, obs: str + artifacts: Artifacts, code: str, obs: str, result: Execution ) -> None: """Extracts and saves files used in the code to the artifacts object. @@ -766,10 +769,32 @@ def extract_and_save_files_to_artifacts( artifacts (Artifacts): The artifacts object to save the files to. code (str): The code to extract the files from. 
""" + + # This is very hacky but there's no nice way to get the files into artifacts if the + # code is executed in a remote environment and we don't have access to the remove + # file system. + files = {} + for res in result.results: + if len(res.formats()) == 1 and res.formats()[0] in ["png", "jpeg", "mp4"]: + format = res.formats()[0] + if format == "png": + data = base64.b64decode(res.png) if res.png is not None else None + elif format == "jpeg": + data = base64.b64decode(res.jpeg) if res.jpeg is not None else None + elif format == "mp4": + data = base64.b64decode(res.mp4) if res.mp4 is not None else None + else: + data = None + + if format not in files: + files[format] = [data] + else: + files[format].append(data) + try: response = extract_json( AnthropicLMM()( # type: ignore - f"""You are a helpful AI assistant. Your job is to look at a snippet of code and the output of running that code and return the file paths that are being saved in the file. Below is the code snippet: + f"""You are a helpful AI assistant. You are given a number of files for certain file types, your job is to look at the code and the output of running that code and assign each file a file name. Below is the code snippet: ```python {code} @@ -779,43 +804,51 @@ def extract_and_save_files_to_artifacts( {obs} ``` +Here's the number of files that need file names: +{json.dumps({k: len(v) for k, v in files.items()})} + +The name cannot conflict with any of these existing names: +{json.dumps(list(artifacts.artifacts.keys()))} + Return the file paths in the following JSON format: -{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}""" +{{"png": ["image_name1.png", "other_image_name.png"], "mp4": ["video_name.mp4"]}}""" ) ) except json.JSONDecodeError: - return - - text_file_ext = [ - ".txt", - ".md", - "rtf", - ".html", - ".htm", - "xml", - ".json", - ".csv", - ".tsv", - ".yaml", - ".yml", - ".toml", - ".conf", - ".env" ".ini", - ".log", - ".py", - ".java", - ".js", - ".cpp", - ".c" ".sql", - ".sh", - ] - - if "file_paths" in response and isinstance(response["file_paths"], list): - for file_path in response["file_paths"]: - read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb" - if Path(file_path).is_file(): - with open(file_path, read_mode) as f: - artifacts[Path(file_path).name] = f.read() + response = {} + + def find_name(file: Path, names: List[str]) -> str: + if not str(file) in names: + return str(file) + name = file.name + suffix = file.suffix + # test basic names first + for i in range(100): + new_name = f"{name}_output_{i}{suffix}" + if new_name not in names: + return new_name + return f"{name}_output_{str(uuid.uuid4())}{suffix}" + + for format in files.keys(): + i = 0 + if format in response: + for file in response[format]: + if i < len(files[format]) and files[format][i] is not None: + new_name = find_name( + Path(file).with_suffix("." + format), + list(artifacts.artifacts.keys()), + ) + artifacts[new_name] = files[format][i] + i += 1 + if i < len(files[format]): + for j in range(i, len(files[format])): + name = "image" if format in ["png", "jpeg"] else "video" + if files[format][j] is not None: + new_name = find_name( + Path(f"{name}").with_suffix("." 
+ format), + list(artifacts.artifacts.keys()), + ) + artifacts[new_name] = files[format][j] META_TOOL_DOCSTRING = get_tool_documentation( From d92e192db519fab3b854f8cda02ed381345d7473 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 14:40:24 -0700 Subject: [PATCH 07/26] add better prompts for dealing with artifacts --- vision_agent/agent/vision_agent_prompts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 278b0b4c..7e06bfb2 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,7 +1,7 @@ VA_CODE = """ **Role**: You are a helpful agent that assists users with writing code. -**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code. +**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved only AFTER you execute python code. print("Hello World!") @@ -28,6 +28,7 @@ 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. 2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set to `true`. +4. **Artifacts**: Files are only saved in `artifacts` after , do not try to access artifacts until you observe that they are loaded. 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. 5. **Output**: You can only respond with , , , and tags. 
From 250bbaafebc802914729f8349d8ad61cb57a304b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 16:52:28 -0700 Subject: [PATCH 08/26] add another prmopt example, reformat to reduce complex --- vision_agent/agent/vision_agent.py | 75 +++++++++++++++++++----------- 1 file changed, 48 insertions(+), 27 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 48cd6590..b302c366 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -12,6 +12,7 @@ EXAMPLES_CODE1, EXAMPLES_CODE2, EXAMPLES_CODE3, + EXAMPLES_CODE3_EXTRA2, VA_CODE, ) from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM @@ -110,7 +111,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: prompt = VA_CODE.format( documentation=META_TOOL_DOCSTRING, - examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}", + examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}\n{EXAMPLES_CODE3_EXTRA2}", conversation=conversation, ) message: Message = {"role": "user", "content": prompt} @@ -182,10 +183,46 @@ def execute_user_code_action( ) if user_result.error: user_obs += f"\n{user_result.error}" - extract_and_save_files_to_artifacts(artifacts, user_code_action, user_obs) + extract_and_save_files_to_artifacts( + artifacts, user_code_action, user_obs, user_result + ) return user_result, user_obs +def _add_media_obs( + code_action: str, + artifacts: Artifacts, + result: Execution, + obs: str, + code_interpreter: CodeInterpreter, + remote_artifacts_path: Path, + local_artifacts_path: Path, +) -> Dict[str, Any]: + obs_chat_elt: Message = {"role": "observation", "content": obs} + media_obs = check_and_load_image(code_action) + if media_obs and result.success: + # for view_media_artifact, we need to ensure the media is loaded + # locally so the conversation agent can actually see it + code_interpreter.download_file( + str(remote_artifacts_path.name), + str(local_artifacts_path), + ) + artifacts.load( + local_artifacts_path, + local_artifacts_path.parent, + ) + + # check if the media is actually in the artifacts + media_obs_chat = [] + for media_ob in media_obs: + if media_ob in artifacts.artifacts: + media_obs_chat.append(local_artifacts_path.parent / media_ob) + if len(media_obs_chat) > 0: + obs_chat_elt["media"] = media_obs_chat + + return obs_chat_elt + + def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]: response = copy.deepcopy(response) @@ -544,35 +581,19 @@ def chat_with_artifacts( code_interpreter, str(remote_artifacts_path), ) - - media_obs = check_and_load_image(code_action) + obs_chat_elt = _add_media_obs( + code_action, + artifacts, + result, + obs, + code_interpreter, + Path(remote_artifacts_path), + Path(self.local_artifacts_path), + ) if self.verbosity >= 1: _LOGGER.info(obs) - obs_chat_elt: Message = {"role": "observation", "content": obs} - if media_obs and result.success: - # for view_media_artifact, we need to ensure the media is loaded - # locally so the conversation agent can actually see it - code_interpreter.download_file( - str(remote_artifacts_path.name), - str(self.local_artifacts_path), - ) - artifacts.load( - self.local_artifacts_path, - Path(self.local_artifacts_path).parent, - ) - - # check if the media is actually in the artifacts - media_obs_chat = [] - for media_ob in media_obs: - if media_ob in artifacts.artifacts: - media_obs_chat.append( - Path(self.local_artifacts_path).parent / media_ob - ) - if len(media_obs_chat) > 0: - obs_chat_elt["media"] = 
media_obs_chat - # don't add execution results to internal chat int_chat.append(obs_chat_elt) obs_chat_elt["execution"] = result From b2bcc9fc28a5906ee1e0ab7e0a77c618c512ad1c Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 16:53:35 -0700 Subject: [PATCH 09/26] add another prompt for editing code --- vision_agent/agent/vision_agent_prompts.py | 65 +++++++++++++++++++++- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 7e06bfb2..66d6614e 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -54,7 +54,7 @@ false OBSERVATION: -[Artifact dog_detector.py] +[Artifact dog_detector.py (5 lines total)] 0|from vision_agent.tools import load_image, owl_v2 1|def detect_dogs(image_path: str): 2| image = load_image(image_path) @@ -95,7 +95,7 @@ false OBSERVATION: -[Artifact dog_detector.py] +[Artifact dog_detector.py (5 lines total)] 0|from vision_agent.tools import load_image, owl_v2 1|def detect_dogs(image_path: str): 2| image = load_image(image_path) @@ -167,7 +167,7 @@ false OBSERVATION: -[Artifact code.py] +[Artifact code.py (17 lines total)] 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image 1|def count_workers_with_helmets(image_path: str, output_path: str): 2| image = load_image(image_path) @@ -202,7 +202,9 @@ AGENT: Two workers with helmets are detected, I will show this to the user and ask them if the result looks good. The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'. true +""" +EXAMPLES_CODE3_EXTRA1 = """ USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" AGENT: Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model @@ -250,3 +252,60 @@ def count_workers_with_helmets(image_path: str, output_path: str): I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'. true """ + +EXAMPLES_CODE3_EXTRA2 = """ +USER: Can you write a streamlit app that allows me to upload an image and count the workers with helmets? + +AGENT: I will use edit_code_artifact to generate a streamlit app that allows the user to upload an image and count the workers with helmets. 
+ +CODE = ''' +import streamlit as st +from code imoprt count_workers_with_helmets +import os + +def main(): + st.title("Worker with Helmet Counter") + uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"]) + if uploaded_file is not None: + with open("temp_image.jpg", "wb") as f: + f.write(uploaded_file.read()) + + count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg") + st.image("temp_image_viz.jpg") + st.write(f"Number of workers with helmets: {count}") + os.remove("temp_image.jpg") + os.remove("temp_image_viz.jpg") + +if __name__ == "__main__": + main() +''' +edit_code_artifact(artifacts, 'streamlit_app.py', CODE, 0, 0) + + +OBSERVATION: +[Artifact streamlit_app.py (19 lines total)] +0|import streamlit as st +1|from code imoprt count_workers_with_helmets +2|import os +3| +4|def main(): +5| st.title("Worker with Helmet Counter") +6| uploaded_file = st.file_uploader("Choose image...", type=["jpeg", "jpg", "png"]) +7| if uploaded_file is not None: +8| with open("temp_image.jpg", "wb") as f: +9| f.write(uploaded_file.read()) +10| +11| count = count_workers_with_helmets("temp_image.jpg", "temp_image_viz.jpg") +12| st.image("temp_image_viz.jpg") +13| st.write(f"Number of workers with helmets: {count}") +14| os.remove("temp_image.jpg") +15| os.remove("temp_image_viz.jpg") +16| +17|if __name__ == "__main__": +18| main() +[End of artifact] + +AGENT: I have generated the streamlit app that allows the user to upload an image and count the workers +The streamlit app is saved in streamlit_app.py, you can run the app by running `streamlit run streamlit_app.py`. +true +""" From 9ab9e1d78ceda9b651436e16f6ba89ecbc4918a4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 16:53:50 -0700 Subject: [PATCH 10/26] fix issues with edit code --- vision_agent/tools/meta_tools.py | 116 ++++++++++++++++++------------- 1 file changed, 68 insertions(+), 48 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index f166eba2..fd3f306e 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -176,9 +176,9 @@ def view_lines( f"[Artifact: {name} ({total_lines} lines total)]\n" + format_lines(lines[start:end], start) + ( - "[End of artifact]" + "\n[End of artifact]" if end == len(lines) - else f"[{len(lines) - end} more lines]" + else f"\n[{len(lines) - end} more lines]" ) ) @@ -258,8 +258,10 @@ def edit_code_artifact( Parameters: artifacts (Artifacts): The artifacts object to edit the artifact from. name (str): The name of the artifact to edit. - start (int): The line number to start the edit. - end (int): The line number to end the edit. + start (int): The line number to start the edit, can be in [-1, total_lines] + where -1 represents the end of the file. + end (int): The line number to end the edit, can be in [-1, total_lines] where + -1 represents the end of the file. content (str): The content to insert. 
""" # just make the artifact if it doesn't exist instead of forcing agent to call @@ -268,17 +270,21 @@ def edit_code_artifact( artifacts[name] = "" total_lines = len(artifacts[name].splitlines()) - if start < 0 or end < 0 or start > end or end > total_lines: + if start < -1 or end < -1 or start > end or end > total_lines: print("[Invalid line range]") return "[Invalid line range]" - if start == end: - end += 1 + + if start == -1: + start = total_lines + if end == -1: + end = total_lines new_content_lines = content.splitlines(keepends=True) new_content_lines = [ line if line.endswith("\n") else line + "\n" for line in new_content_lines ] lines = artifacts[name].splitlines(keepends=True) + lines = [line if line.endswith("\n") else line + "\n" for line in lines] edited_lines = lines[:start] + new_content_lines + lines[end:] cur_line = start + len(content.split("\n")) // 2 @@ -760,6 +766,51 @@ def use_object_detection_fine_tuning( return diff +def _find_name(file: Path, names: List[str]) -> str: + if not str(file) in names: + return str(file) + name = file.name + suffix = file.suffix + # test basic names first + for i in range(100): + new_name = f"{name}_output_{i}{suffix}" + if new_name not in names: + return new_name + return f"{name}_output_{str(uuid.uuid4())}{suffix}" + + +def _extract_file_names( + code: str, obs: str, file_counts: Dict[str, int], existing_names: List[str] +) -> Dict[str, List[str]]: + try: + response = extract_json( + AnthropicLMM()( # type: ignore + f"""You are a helpful AI assistant. You are given a number of files for certain file types, your job is to look at the code and the output of running that code and assign each file a file name. Below is the code snippet: + +```python +{code} +``` + +```output +{obs} +``` + +Here's the number of files that need file names: +{json.dumps({k: v for k, v in file_counts.items()})} + +The name cannot conflict with any of these existing names: +{str(existing_names)} + +Return the file paths in the following JSON format: +{{"png": ["image_name1.png", "other_image_name.png"], "mp4": ["video_name.mp4"]}}""" + ) + ) + except json.JSONDecodeError: + response = {} + + return response + + def extract_and_save_files_to_artifacts( artifacts: Artifacts, code: str, obs: str, result: Execution ) -> None: @@ -775,8 +826,8 @@ def extract_and_save_files_to_artifacts( # file system. files = {} for res in result.results: - if len(res.formats()) == 1 and res.formats()[0] in ["png", "jpeg", "mp4"]: - format = res.formats()[0] + if len(res.formats()) == 1 and res.formats()[0] in ["png", "jpeg", "mp4"]: # type: ignore + format = res.formats()[0] # type: ignore if format == "png": data = base64.b64decode(res.png) if res.png is not None else None elif format == "jpeg": @@ -791,50 +842,19 @@ def extract_and_save_files_to_artifacts( else: files[format].append(data) - try: - response = extract_json( - AnthropicLMM()( # type: ignore - f"""You are a helpful AI assistant. You are given a number of files for certain file types, your job is to look at the code and the output of running that code and assign each file a file name. 
Below is the code snippet: - -```python -{code} -``` - -```output -{obs} -``` - -Here's the number of files that need file names: -{json.dumps({k: len(v) for k, v in files.items()})} - -The name cannot conflict with any of these existing names: -{json.dumps(list(artifacts.artifacts.keys()))} - -Return the file paths in the following JSON format: -{{"png": ["image_name1.png", "other_image_name.png"], "mp4": ["video_name.mp4"]}}""" - ) - ) - except json.JSONDecodeError: - response = {} - - def find_name(file: Path, names: List[str]) -> str: - if not str(file) in names: - return str(file) - name = file.name - suffix = file.suffix - # test basic names first - for i in range(100): - new_name = f"{name}_output_{i}{suffix}" - if new_name not in names: - return new_name - return f"{name}_output_{str(uuid.uuid4())}{suffix}" + response = _extract_file_names( + code, + obs, + {k: len(v) for k, v in files.items()}, + list(artifacts.artifacts.keys()), + ) for format in files.keys(): i = 0 if format in response: for file in response[format]: if i < len(files[format]) and files[format][i] is not None: - new_name = find_name( + new_name = _find_name( Path(file).with_suffix("." + format), list(artifacts.artifacts.keys()), ) @@ -844,7 +864,7 @@ def find_name(file: Path, names: List[str]) -> str: for j in range(i, len(files[format])): name = "image" if format in ["png", "jpeg"] else "video" if files[format][j] is not None: - new_name = find_name( + new_name = _find_name( Path(f"{name}").with_suffix("." + format), list(artifacts.artifacts.keys()), ) From 0247de107c0772f154a626b7b290749d1be38cdb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 17:15:40 -0700 Subject: [PATCH 11/26] remove verbosity arg --- vision_agent/tools/meta_tools.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index fd3f306e..b7352583 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -28,7 +28,6 @@ CURRENT_LINE = 0 DEFAULT_WINDOW_SIZE = 100 ZMQ_PORT = os.environ.get("ZMQ_PORT", None) -VERBOSITY = os.environ.get("VERBOSITY", 0) def report_progress_callback(port: int, inp: Dict[str, Any]) -> None: @@ -443,14 +442,16 @@ def detect_dogs(image_path: str): dogs = owl_v2("dog", image) return dogs """ + # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation if ZMQ_PORT is not None: agent = va.agent.VisionAgentCoder( report_progress_callback=lambda inp: report_progress_callback( int(ZMQ_PORT), inp - ) + ), + verbosity=0, ) else: - agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY)) + agent = va.agent.VisionAgentCoder(verbosity=0) fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] response = agent.generate_code( @@ -514,7 +515,8 @@ def detect_dogs(image_path: str): return dogs """ - agent = va.agent.VisionAgentCoder(verbosity=int(VERBOSITY)) + # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation + agent = va.agent.VisionAgentCoder(verbosity=0) if name not in artifacts: print(f"[Artifact {name} does not exist]") return f"[Artifact {name} does not exist]" From c514da353d2173d4e5ac68cb41143b565e4fd4eb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 18:55:01 -0700 Subject: [PATCH 12/26] limit context length and amount of debugging VA does --- vision_agent/agent/vision_agent.py | 3 ++- vision_agent/agent/vision_agent_prompts.py | 8 ++++---- 2 files changed, 6 insertions(+), 5 
deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index b302c366..64a8ff49 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -98,8 +98,9 @@ def _clean_response(response: str) -> str: def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: chat = copy.deepcopy(chat) + # only add 10 most recent messages in the chat to not go over token limit conversation = "" - for chat_i in chat: + for chat_i in chat[-10:]: if chat_i["role"] == "user": conversation += f"USER: {chat_i['content']}\n\n" elif chat_i["role"] == "observation": diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 66d6614e..228cb592 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,7 +1,7 @@ VA_CODE = """ **Role**: You are a helpful agent that assists users with writing code. -**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved only AFTER you execute python code. +**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved only AFTER you execute python code. The user can see all `artifacts`. print("Hello World!") @@ -26,11 +26,11 @@ **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. -2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. +2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. DO NOT run `edit_vision_code` or `edit_code_artifact` more than 2 times in a row and instead ask the user for help. 3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set to `true`. 4. **Artifacts**: Files are only saved in `artifacts` after , do not try to access artifacts until you observe that they are loaded. -4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. -5. **Output**: You can only respond with , , , and tags. +5. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. +6. **Output**: You can only respond with , , , and tags. Your thoughts here... Your response to the user here... 
From 3f86993101579a841f7526ef7da5cbce8e9f1397 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 18:55:12 -0700 Subject: [PATCH 13/26] fix side case with edit_artifact --- vision_agent/tools/meta_tools.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index b7352583..d07f3949 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -269,15 +269,15 @@ def edit_code_artifact( artifacts[name] = "" total_lines = len(artifacts[name].splitlines()) - if start < -1 or end < -1 or start > end or end > total_lines: - print("[Invalid line range]") - return "[Invalid line range]" - if start == -1: start = total_lines if end == -1: end = total_lines + if start < 0 or end < 0 or start > end or end > total_lines: + print("[Invalid line range]") + return "[Invalid line range]" + new_content_lines = content.splitlines(keepends=True) new_content_lines = [ line if line.endswith("\n") else line + "\n" for line in new_content_lines @@ -378,14 +378,16 @@ def generate_vision_plan( [End Plan Context] """ + # verbosity is set to 0 to avoid adding extra content to the VisionAgent conversation if ZMQ_PORT is not None: agent = va.agent.VisionAgentPlanner( report_progress_callback=lambda inp: report_progress_callback( int(ZMQ_PORT), inp - ) + ), + verbosity=0, ) else: - agent = va.agent.VisionAgentPlanner() + agent = va.agent.VisionAgentPlanner(verbosity=0) fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] response = agent.generate_plan( @@ -778,7 +780,7 @@ def _find_name(file: Path, names: List[str]) -> str: new_name = f"{name}_output_{i}{suffix}" if new_name not in names: return new_name - return f"{name}_output_{str(uuid.uuid4())}{suffix}" + return f"{name}_output_{str(uuid.uuid4())[:4]}{suffix}" def _extract_file_names( From 04c57f976a0a4f7933b71d750ae0c99cb541612f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 19:03:55 -0700 Subject: [PATCH 14/26] remove generate vision plan --- vision_agent/tools/meta_tools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index d07f3949..994ef48c 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -881,7 +881,6 @@ def extract_and_save_files_to_artifacts( open_code_artifact, create_code_artifact, edit_code_artifact, - generate_vision_plan, generate_vision_code, edit_vision_code, view_media_artifact, From 5fa377e420654b1be3488cb487bfaf1440f7020a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 19:46:12 -0700 Subject: [PATCH 15/26] better parsing for json --- vision_agent/tools/meta_tools.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 994ef48c..887ff86d 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -806,7 +806,10 @@ def _extract_file_names( {str(existing_names)} Return the file paths in the following JSON format: -{{"png": ["image_name1.png", "other_image_name.png"], "mp4": ["video_name.mp4"]}}""" +```json +{{"png": ["image_name1.png", "other_image_name.png"], "mp4": ["video_name.mp4"]}} +``` +""" ) ) except json.JSONDecodeError: From c8eea979f0d4b7b0fec3507ed2a7107f1cef4e0e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 19:56:14 -0700 Subject: [PATCH 16/26] loop over formats --- 
vision_agent/tools/meta_tools.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 887ff86d..10c44bac 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -833,21 +833,21 @@ def extract_and_save_files_to_artifacts( # file system. files = {} for res in result.results: - if len(res.formats()) == 1 and res.formats()[0] in ["png", "jpeg", "mp4"]: # type: ignore - format = res.formats()[0] # type: ignore - if format == "png": - data = base64.b64decode(res.png) if res.png is not None else None - elif format == "jpeg": - data = base64.b64decode(res.jpeg) if res.jpeg is not None else None - elif format == "mp4": - data = base64.b64decode(res.mp4) if res.mp4 is not None else None - else: - data = None - - if format not in files: - files[format] = [data] - else: - files[format].append(data) + for format in res.formats(): + if format in ["png", "jpeg", "mp4"]: # type: ignore + if format == "png": + data = base64.b64decode(res.png) if res.png is not None else None + elif format == "jpeg": + data = base64.b64decode(res.jpeg) if res.jpeg is not None else None + elif format == "mp4": + data = base64.b64decode(res.mp4) if res.mp4 is not None else None + else: + data = None + + if format not in files: + files[format] = [data] + else: + files[format].append(data) response = _extract_file_names( code, From 666ab3cc92aff5610fbd4ca1a2ac247b8d72026d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 20:03:25 -0700 Subject: [PATCH 17/26] updated prompt to tell it save_image/video will save to artifacts --- vision_agent/agent/vision_agent_prompts.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 228cb592..62338df0 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,7 +1,7 @@ VA_CODE = """ **Role**: You are a helpful agent that assists users with writing code. -**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved only AFTER you execute python code. The user can see all `artifacts`. +**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be saved only AFTER you execute python code with `save_imgae` or `save_video`. The user can see all `artifacts`. print("Hello World!") @@ -143,7 +143,7 @@ EXAMPLES_CODE3 = """ -USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes? 
+USER: Can you create a function to count workers with helmets, return the count and a visaulization of the bounding boxes? OBSERVATION: [Artifacts loaded] @@ -169,7 +169,7 @@ OBSERVATION: [Artifact code.py (17 lines total)] 0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image -1|def count_workers_with_helmets(image_path: str, output_path: str): +1|def count_workers_with_helmets(image_path: str): 2| image = load_image(image_path) 3| detections = florence2_phrase_grounding("worker, helmet", image) 4| workers = [d for d in detections if d['label'] == 'worker'] @@ -183,15 +183,16 @@ 11| break 11| if person_has_helmet: 13| count += 1 -14| overlay_bounding_boxes(image, detections) -15| save_image(output_path, image) -16| return count +14| viz = overlay_bounding_boxes(image, detections) +16| return count, viz [End of artifact] -AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output. +AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code, print the counts and save the visualization. from code import count_workers_with_helmets -print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png')) +count, viz = count_workers_with_helmets('/path/to/images/workers.png') +save_image(viz, 'workers_viz.png') +print(count) false From 906ee6684e2561a205f5e85c438441e6880c9bbb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 20:24:41 -0700 Subject: [PATCH 18/26] ensure artifact is saved --- vision_agent/agent/vision_agent.py | 12 +++++++++++- vision_agent/tools/meta_tools.py | 10 ++++++---- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 64a8ff49..51745f53 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -155,6 +155,7 @@ def execute_code_action( obs = str(result.logs) if result.error: obs += f"\n{result.error}" + __import__("ipdb").set_trace() extract_and_save_files_to_artifacts(artifacts, code, obs, result) return result, obs @@ -323,6 +324,7 @@ def __init__( agent: Optional[LMM] = None, verbosity: int = 0, local_artifacts_path: Optional[Union[str, Path]] = None, + remote_artifacts_path: Optional[Union[str, Path]] = None, callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, code_interpreter: Optional[Union[str, CodeInterpreter]] = None, ) -> None: @@ -357,6 +359,14 @@ def __init__( else Path(tempfile.NamedTemporaryFile(delete=False).name) ), ) + self.remote_artifacts_path = cast( + str, + ( + Path(remote_artifacts_path) + if remote_artifacts_path is not None + else Path(WORKSPACE / "artifacts.pkl") + ), + ) def __call__( self, @@ -433,7 +443,7 @@ def chat_with_artifacts( if not artifacts: # this is setting remote artifacts path - artifacts = Artifacts(WORKSPACE / "artifacts.pkl") + artifacts = Artifacts(self.remote_artifacts_path, self.local_artifacts_path) # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues code_interpreter = ( diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 10c44bac..d13e3731 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -87,8 +87,11 @@ class Artifacts: need to be in sync with the remote environment the VisionAgent is running in. 
""" - def __init__(self, remote_save_path: Union[str, Path]) -> None: + def __init__( + self, remote_save_path: Union[str, Path], local_save_path: Union[str, Path] + ) -> None: self.remote_save_path = Path(remote_save_path) + self.local_save_path = Path(local_save_path) self.artifacts: Dict[str, Any] = {} self.code_sandbox_runtime = None @@ -132,9 +135,7 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str: return output_str def save(self, local_path: Optional[Union[str, Path]] = None) -> None: - save_path = ( - Path(local_path) if local_path is not None else self.remote_save_path - ) + save_path = Path(local_path) if local_path is not None else self.local_save_path with open(save_path, "wb") as f: pkl.dump(self.artifacts, f) @@ -876,6 +877,7 @@ def extract_and_save_files_to_artifacts( list(artifacts.artifacts.keys()), ) artifacts[new_name] = files[format][j] + artifacts.save() META_TOOL_DOCSTRING = get_tool_documentation( From 802c7e2040c7c449542c6011a429c0d43585523e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 20:26:06 -0700 Subject: [PATCH 19/26] remove break point --- vision_agent/agent/vision_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 51745f53..761c90dd 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -155,7 +155,6 @@ def execute_code_action( obs = str(result.logs) if result.error: obs += f"\n{result.error}" - __import__("ipdb").set_trace() extract_and_save_files_to_artifacts(artifacts, code, obs, result) return result, obs From b58e48d674cae8f6ee61a15c2dcc29ec7c6cc700 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 20:35:57 -0700 Subject: [PATCH 20/26] fixed type errors --- vision_agent/agent/vision_agent.py | 10 ++++++++++ vision_agent/tools/meta_tools.py | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 761c90dd..1bb00621 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -335,6 +335,8 @@ def __init__( verbosity (int): The verbosity level of the agent. local_artifacts_path (Optional[Union[str, Path]]): The path to the local artifacts file. + remote_artifacts_path (Optional[Union[str, Path]]): The path to the remote + artifacts file. callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback function to send intermediate update messages. code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values @@ -643,6 +645,7 @@ def __init__( agent: Optional[LMM] = None, verbosity: int = 0, local_artifacts_path: Optional[Union[str, Path]] = None, + remote_artifacts_path: Optional[Union[str, Path]] = None, callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, code_interpreter: Optional[Union[str, CodeInterpreter]] = None, ) -> None: @@ -654,6 +657,8 @@ def __init__( verbosity (int): The verbosity level of the agent. local_artifacts_path (Optional[Union[str, Path]]): The path to the local artifacts file. + remote_artifacts_path (Optional[Union[str, Path]]): The path to the remote + artifacts file. callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback function to send intermediate update messages. 
code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values @@ -667,6 +672,7 @@ def __init__( agent, verbosity, local_artifacts_path, + remote_artifacts_path, callback_message, code_interpreter, ) @@ -678,6 +684,7 @@ def __init__( agent: Optional[LMM] = None, verbosity: int = 0, local_artifacts_path: Optional[Union[str, Path]] = None, + remote_artifacts_path: Optional[Union[str, Path]] = None, callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, code_interpreter: Optional[Union[str, CodeInterpreter]] = None, ) -> None: @@ -689,6 +696,8 @@ def __init__( verbosity (int): The verbosity level of the agent. local_artifacts_path (Optional[Union[str, Path]]): The path to the local artifacts file. + remote_artifacts_path (Optional[Union[str, Path]]): The path to the remote + artifacts file. callback_message (Optional[Callable[[Dict[str, Any]], None]]): Callback function to send intermediate update messages. code_interpreter (Optional[Union[str, CodeInterpreter]]): For string values @@ -702,6 +711,7 @@ def __init__( agent, verbosity, local_artifacts_path, + remote_artifacts_path, callback_message, code_interpreter, ) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index d13e3731..ffbfc204 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -835,7 +835,7 @@ def extract_and_save_files_to_artifacts( files = {} for res in result.results: for format in res.formats(): - if format in ["png", "jpeg", "mp4"]: # type: ignore + if format in ["png", "jpeg", "mp4"]: if format == "png": data = base64.b64decode(res.png) if res.png is not None else None elif format == "jpeg": From 55fc5982e6af391d07e85bdbaa4df15e18f4f74e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 15 Oct 2024 21:02:46 -0700 Subject: [PATCH 21/26] upload and download artifacts per turn --- vision_agent/agent/vision_agent.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 1bb00621..42204190 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -39,7 +39,7 @@ class BoilerplateCode: "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", - "artifacts = Artifacts('{remote_path}')", + "artifacts = Artifacts('{remote_path}', '{remote_path}')", "artifacts.load('{remote_path}')", ] post_code = [ @@ -202,8 +202,10 @@ def _add_media_obs( obs_chat_elt: Message = {"role": "observation", "content": obs} media_obs = check_and_load_image(code_action) if media_obs and result.success: - # for view_media_artifact, we need to ensure the media is loaded - # locally so the conversation agent can actually see it + # for view_media_artifact, we need to ensure the media is loaded locally so + # the conversation agent can actually see it. We also download it here so we + # can check if it contains the actual media (note this is in addition to + # downloading it per turn). 
code_interpreter.download_file( str(remote_artifacts_path.name), str(local_artifacts_path), @@ -530,6 +532,10 @@ def chat_with_artifacts( ) while not finished and iterations < self.max_iterations: + # ensure we upload the artifacts before each turn, so any local + # modifications we made to it will be reflected in the remote + code_interpreter.upload_file(self.local_artifacts_path) + response = run_conversation(self.agent, int_chat) if self.verbosity >= 1: _LOGGER.info(response) @@ -622,13 +628,14 @@ def chat_with_artifacts( iterations += 1 last_response = response - # after running the agent, download the artifacts locally - code_interpreter.download_file( - str(remote_artifacts_path.name), str(self.local_artifacts_path) - ) - artifacts.load( - self.local_artifacts_path, Path(self.local_artifacts_path).parent - ) + # after each turn, download the artifacts locally + code_interpreter.download_file( + str(remote_artifacts_path.name), str(self.local_artifacts_path) + ) + artifacts.load( + self.local_artifacts_path, Path(self.local_artifacts_path).parent + ) + return orig_chat, artifacts def streaming_message(self, message: Dict[str, Any]) -> None: From 8161c485b230dbff403da6a118c204a426c5ba47 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 16 Oct 2024 09:13:25 -0700 Subject: [PATCH 22/26] fixed streamlit app for new updates --- examples/chat/app.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/chat/app.py b/examples/chat/app.py index 25188649..66d222ba 100644 --- a/examples/chat/app.py +++ b/examples/chat/app.py @@ -27,13 +27,19 @@ "style": {"bottom": "calc(50% - 4.25rem", "right": "0.4rem"}, } # set artifacts remote_path to WORKSPACE -artifacts = va.tools.meta_tools.Artifacts(WORKSPACE / "artifacts.pkl") +local_artifacts_path = "artifacts.pkl" +remote_artifacts_path = WORKSPACE / "artifacts.pkl" +artifacts = va.tools.meta_tools.Artifacts(remote_artifacts_path, local_artifacts_path) if Path("artifacts.pkl").exists(): artifacts.load("artifacts.pkl") else: artifacts.save("artifacts.pkl") -agent = va.agent.VisionAgent(verbosity=1, local_artifacts_path="artifacts.pkl") +agent = va.agent.VisionAgent( + verbosity=2, + local_artifacts_path=local_artifacts_path, + remote_artifacts_path=remote_artifacts_path, +) st.set_page_config(layout="wide") @@ -54,7 +60,9 @@ def update_messages(messages, lock): with lock: if Path("artifacts.pkl").exists(): artifacts.load("artifacts.pkl") - new_chat, _ = agent.chat_with_artifacts(messages, artifacts=artifacts) + new_chat, _ = agent.chat_with_artifacts( + messages, artifacts=artifacts, test_multi_plan=False + ) for new_message in new_chat[len(messages) :]: messages.append(new_message) From dea87566965859603b6447fe44afb02b69da465b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 16 Oct 2024 09:13:39 -0700 Subject: [PATCH 23/26] reduced code complexity --- vision_agent/agent/vision_agent.py | 127 +++++++++++------------------ vision_agent/tools/meta_tools.py | 6 +- 2 files changed, 51 insertions(+), 82 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 42204190..8cec02db 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -2,6 +2,7 @@ import json import logging import os +import pickle as pkl import tempfile from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast @@ -122,7 +123,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: and "media" in 
chat[-1] and len(chat[-1]["media"]) > 0 # type: ignore ): - message["media"] = chat[-1]["media"] + media_obs = [media for media in chat[-1]["media"] if Path(media).exists()] # type: ignore + if len(media_obs) > 0: + message["media"] = media_obs # type: ignore conv_resp = cast(str, orch([message], stream=False)) # clean the response first, if we are executing code, do not resond or end @@ -146,10 +149,11 @@ def execute_code_action( artifacts: Artifacts, code: str, code_interpreter: CodeInterpreter, - artifact_remote_path: str, ) -> Tuple[Execution, str]: result = code_interpreter.exec_isolation( - BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) + BoilerplateCode.add_boilerplate( + code, remote_path=str(artifacts.remote_save_path) + ) ) obs = str(result.logs) @@ -163,7 +167,6 @@ def execute_user_code_action( artifacts: Artifacts, last_user_message: Message, code_interpreter: CodeInterpreter, - artifact_remote_path: str, ) -> Tuple[Optional[Execution], Optional[str]]: user_result = None user_obs = None @@ -180,50 +183,28 @@ def execute_user_code_action( if user_code_action is not None: user_code_action = use_extra_vision_agent_args(user_code_action, False) user_result, user_obs = execute_code_action( - artifacts, user_code_action, code_interpreter, artifact_remote_path + artifacts, user_code_action, code_interpreter ) if user_result.error: user_obs += f"\n{user_result.error}" - extract_and_save_files_to_artifacts( - artifacts, user_code_action, user_obs, user_result - ) return user_result, user_obs -def _add_media_obs( - code_action: str, - artifacts: Artifacts, - result: Execution, - obs: str, - code_interpreter: CodeInterpreter, - remote_artifacts_path: Path, - local_artifacts_path: Path, -) -> Dict[str, Any]: - obs_chat_elt: Message = {"role": "observation", "content": obs} - media_obs = check_and_load_image(code_action) - if media_obs and result.success: - # for view_media_artifact, we need to ensure the media is loaded locally so - # the conversation agent can actually see it. We also download it here so we - # can check if it contains the actual media (note this is in addition to - # downloading it per turn). 
+def download_and_merge_artifacts( + code_interpreter: CodeInterpreter, artifacts: Artifacts +) -> None: + with tempfile.TemporaryFile() as temp_file: code_interpreter.download_file( - str(remote_artifacts_path.name), - str(local_artifacts_path), - ) - artifacts.load( - local_artifacts_path, - local_artifacts_path.parent, + str(artifacts.remote_save_path), + str(temp_file), ) - - # check if the media is actually in the artifacts - media_obs_chat = [] - for media_ob in media_obs: - if media_ob in artifacts.artifacts: - media_obs_chat.append(local_artifacts_path.parent / media_ob) - if len(media_obs_chat) > 0: - obs_chat_elt["media"] = media_obs_chat - - return obs_chat_elt + temp_file.seek(0) + with open(str(temp_file), "rb") as f: + remote_artifacts = pkl.load(f) + merged_artifacts = {**artifacts.artifacts, **remote_artifacts} + artifacts.artifacts = merged_artifacts + artifacts.save() + artifacts.load(artifacts.local_save_path, artifacts.local_save_path.parent) def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]: @@ -354,21 +335,15 @@ def __init__( self.callback_message = callback_message if self.verbosity >= 1: _LOGGER.setLevel(logging.INFO) - self.local_artifacts_path = cast( - str, - ( - Path(local_artifacts_path) - if local_artifacts_path is not None - else Path(tempfile.NamedTemporaryFile(delete=False).name) - ), + self.local_artifacts_path = ( + Path(local_artifacts_path) + if local_artifacts_path is not None + else Path(tempfile.NamedTemporaryFile(delete=False).name) ) - self.remote_artifacts_path = cast( - str, - ( - Path(remote_artifacts_path) - if remote_artifacts_path is not None - else Path(WORKSPACE / "artifacts.pkl") - ), + self.remote_artifacts_path = ( + Path(remote_artifacts_path) + if remote_artifacts_path is not None + else Path(WORKSPACE / "artifacts.pkl") ) def __call__( @@ -455,8 +430,15 @@ def chat_with_artifacts( and not isinstance(self.code_interpreter, str) else CodeInterpreterFactory.new_instance( code_sandbox_runtime=self.code_interpreter, + remote_path=self.remote_artifacts_path.parent, ) ) + + if code_interpreter.remote_path != self.remote_artifacts_path.parent: + raise ValueError( + f"Code interpreter remote path {code_interpreter.remote_path} does not match {self.remote_artifacts_path.parent}" + ) + with code_interpreter: orig_chat = copy.deepcopy(chat) int_chat = copy.deepcopy(chat) @@ -501,9 +483,7 @@ def chat_with_artifacts( # Upload artifacts to remote location and show where they are going # to be loaded to. The actual loading happens in BoilerplateCode as # part of the pre_code. 
- remote_artifacts_path = code_interpreter.upload_file( - self.local_artifacts_path - ) + code_interpreter.upload_file(self.local_artifacts_path) artifacts_loaded = artifacts.show(code_interpreter.remote_path) int_chat.append({"role": "observation", "content": artifacts_loaded}) orig_chat.append({"role": "observation", "content": artifacts_loaded}) @@ -513,7 +493,6 @@ def chat_with_artifacts( artifacts, last_user_message, code_interpreter, - str(remote_artifacts_path), ) finished = user_result is not None and user_obs is not None if user_result is not None and user_obs is not None: @@ -537,6 +516,11 @@ def chat_with_artifacts( code_interpreter.upload_file(self.local_artifacts_path) response = run_conversation(self.agent, int_chat) + code_action = use_extra_vision_agent_args( + response.get("execute_python", None), + test_multi_plan, + custom_tool_names, + ) if self.verbosity >= 1: _LOGGER.info(response) int_chat.append( @@ -562,12 +546,6 @@ def chat_with_artifacts( finished = response.get("let_user_respond", False) - code_action = response.get("execute_python", None) - if code_action is not None: - code_action = use_extra_vision_agent_args( - code_action, test_multi_plan, custom_tool_names - ) - if last_response == response: self.streaming_message( { @@ -597,17 +575,11 @@ def chat_with_artifacts( artifacts, code_action, code_interpreter, - str(remote_artifacts_path), - ) - obs_chat_elt = _add_media_obs( - code_action, - artifacts, - result, - obs, - code_interpreter, - Path(remote_artifacts_path), - Path(self.local_artifacts_path), ) + obs_chat_elt: Message = {"role": "observation", "content": obs} + media_obs = check_and_load_image(code_action) + if media_obs and result.success: + obs_chat_elt["media"] = media_obs if self.verbosity >= 1: _LOGGER.info(obs) @@ -629,12 +601,7 @@ def chat_with_artifacts( last_response = response # after each turn, download the artifacts locally - code_interpreter.download_file( - str(remote_artifacts_path.name), str(self.local_artifacts_path) - ) - artifacts.load( - self.local_artifacts_path, Path(self.local_artifacts_path).parent - ) + download_and_merge_artifacts(code_interpreter, artifacts) return orig_chat, artifacts diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index ffbfc204..e8185daf 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -662,10 +662,10 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str: def use_extra_vision_agent_args( - code: str, + code: Optional[str], test_multi_plan: bool = True, custom_tool_names: Optional[List[str]] = None, -) -> str: +) -> Optional[str]: """This is for forcing arguments passed by the user to VisionAgent into the VisionAgentCoder call. @@ -677,6 +677,8 @@ def use_extra_vision_agent_args( Returns: str: The edited code. 
""" + if code is None: + return None red = RedBaron(code) for node in red: # seems to always be atomtrailers not call type From 38f23d309a29c3bbde0a724c8457dd5911480dd6 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 16 Oct 2024 09:16:18 -0700 Subject: [PATCH 24/26] fix test case --- tests/unit/test_meta_tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_meta_tools.py b/tests/unit/test_meta_tools.py index fff867d9..50a98e44 100644 --- a/tests/unit/test_meta_tools.py +++ b/tests/unit/test_meta_tools.py @@ -22,7 +22,7 @@ def test_check_and_load_image_two(): def test_use_object_detection_fine_tuning_none(): - artifacts = Artifacts("test") + artifacts = Artifacts("test", "test") code = "print('Hello, World!')" artifacts["code"] = code output = use_object_detection_fine_tuning(artifacts, "code", "123") @@ -33,7 +33,7 @@ def test_use_object_detection_fine_tuning_none(): def test_use_object_detection_fine_tuning(): - artifacts = Artifacts("test") + artifacts = Artifacts("test", "test") code = """florence2_phrase_grounding('one', image1) owl_v2_image('two', image2) florence2_sam2_image('three', image3)""" @@ -50,7 +50,7 @@ def test_use_object_detection_fine_tuning(): def test_use_object_detection_fine_tuning_twice(): - artifacts = Artifacts("test") + artifacts = Artifacts("test", "test") code = """florence2_phrase_grounding('one', image1) owl_v2_image('two', image2) florence2_sam2_image('three', image3)""" @@ -75,7 +75,7 @@ def test_use_object_detection_fine_tuning_twice(): def test_use_object_detection_fine_tuning_real_case(): - artifacts = Artifacts("test") + artifacts = Artifacts("test", "test") code = "florence2_phrase_grounding('(strange arg)', image1)" expected_code = 'florence2_phrase_grounding("(strange arg)", image1, "123")' artifacts["code"] = code From d09a3cd15a27f8ec4a0ad2f631d643219d44baf7 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 16 Oct 2024 10:11:41 -0700 Subject: [PATCH 25/26] add back in list_artifacts --- tests/integ/test_tools.py | 2 +- vision_agent/agent/vision_agent.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 09b8ba5c..0dfa32c8 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -205,7 +205,7 @@ def test_florence2_sam2_image_fine_tune_id(): fine_tune_id=FINE_TUNE_ID, ) # this calls a fine-tuned florence2 model which is going to be worse at this task - assert 14 <= len(result) <= 26 + assert 13 <= len(result) <= 26 assert [res["label"] for res in result] == ["coin"] * len(result) assert len([res["mask"] for res in result]) == len(result) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 8cec02db..74c34f41 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -39,7 +39,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, 
list_artifacts", "artifacts = Artifacts('{remote_path}', '{remote_path}')", "artifacts.load('{remote_path}')", ] From 9ba22275fd56aec504eadddd0623b622e82974fb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 16 Oct 2024 10:34:01 -0700 Subject: [PATCH 26/26] loosen tests --- tests/integ/test_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 0dfa32c8..993f268f 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -86,7 +86,7 @@ def test_owl_v2_fine_tune_id(): fine_tune_id=FINE_TUNE_ID, ) # this calls a fine-tuned florence2 model which is going to be worse at this task - assert 14 <= len(result) <= 26 + assert 13 <= len(result) <= 26 assert [res["label"] for res in result] == ["coin"] * len(result) assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])
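To close the loop on the Optional handling added to use_extra_vision_agent_args earlier in the series, the new early return lets the caller feed response.get("execute_python", None) straight in without a pre-check. A tiny stub illustrating that contract (use_extra_args_stub is an illustrative stand-in, not the real RedBaron-based helper):

    from typing import Optional

    def use_extra_args_stub(code: Optional[str]) -> Optional[str]:
        # Mirrors the new contract: when the agent produced no execute_python
        # block there is nothing to rewrite, so None passes straight through.
        if code is None:
            return None
        # ... the real helper injects test_multi_plan / custom_tool_names here ...
        return code

    assert use_extra_args_stub(None) is None
    assert use_extra_args_stub("print('hello')") == "print('hello')"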