diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 24cb3851..29643ecd 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -19,6 +19,7 @@ META_TOOL_DOCSTRING, Artifacts, check_and_load_image, + extract_and_save_files_to_artifacts, use_extra_vision_agent_args, ) from vision_agent.utils import CodeInterpreterFactory @@ -36,7 +37,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -94,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: elif chat_i["role"] == "observation": conversation += f"OBSERVATION:\n{chat_i['content']}\n\n" elif chat_i["role"] == "assistant": - conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" + conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore else: raise ValueError(f"role {chat_i['role']} is not supported") @@ -127,11 +128,15 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: def execute_code_action( - code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str + artifacts: Artifacts, + code: str, + code_interpreter: CodeInterpreter, + artifact_remote_path: str, ) -> Tuple[Execution, str]: result = code_interpreter.exec_isolation( 
BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) ) + extract_and_save_files_to_artifacts(artifacts, code) obs = str(result.logs) if result.error: @@ -140,6 +145,7 @@ def execute_code_action( def execute_user_code_action( + artifacts: Artifacts, last_user_message: Message, code_interpreter: CodeInterpreter, artifact_remote_path: str, @@ -159,7 +165,7 @@ def execute_user_code_action( if user_code_action is not None: user_code_action = use_extra_vision_agent_args(user_code_action, False) user_result, user_obs = execute_code_action( - user_code_action, code_interpreter, artifact_remote_path + artifacts, user_code_action, code_interpreter, artifact_remote_path ) if user_result.error: user_obs += f"\n{user_result.error}" @@ -385,7 +391,10 @@ def chat_with_artifacts( self.streaming_message({"role": "observation", "content": artifacts_loaded}) user_result, user_obs = execute_user_code_action( - last_user_message, code_interpreter, str(remote_artifacts_path) + artifacts, + last_user_message, + code_interpreter, + str(remote_artifacts_path), ) finished = user_result is not None and user_obs is not None if user_result is not None and user_obs is not None: @@ -456,7 +465,10 @@ def chat_with_artifacts( if code_action is not None: result, obs = execute_code_action( - code_action, code_interpreter, str(remote_artifacts_path) + artifacts, + code_action, + code_interpreter, + str(remote_artifacts_path), ) media_obs = check_and_load_image(code_action) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 93343045..278b0b4c 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -187,10 +187,10 @@ 16| return count [End of artifact] -AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the 
visaulization. +AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output. from code import count_workers_with_helmets -print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png') +print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png')) false @@ -234,11 +234,10 @@ def count_workers_with_helmets(image_path: str, output_path: str): count = 0 [End of edits] -AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization. +AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output. from code import count_workers_with_helmets print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png')) -write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png') false diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 7f59c685..d9537e7c 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,4 +1,3 @@ -import base64 import difflib import json import os @@ -9,7 +8,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -import numpy as np from IPython.display import display from redbaron import RedBaron # type: ignore @@ -22,8 +20,7 @@ from vision_agent.tools.tools import TOOL_DESCRIPTIONS from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask from vision_agent.utils.execute import Execution, MimeType -from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes -from vision_agent.utils.video import frames_to_bytes +from vision_agent.utils.image_utils import convert_to_b64 CURRENT_FILE = None CURRENT_LINE = 0 @@ -393,19 +390,6 @@ def 
generate_vision_plan( redisplay_results(response.test_results) response.test_results = None artifacts[name] = response.model_dump_json() - media_names = extract_json( - AnthropicLMM()( # type: ignore - f"""Extract any media file names from this output in the following JSON format: -{{"media": ["image1.jpg", "image2.jpg"]}} - -{artifacts[name]}""" - ) - ) - if "media" in media_names and isinstance(media_names, dict): - for media in media_names["media"]: - if isinstance(media, str): - with open(media, "rb") as f: - artifacts[media] = f.read() output_str = f"[Start Plan Context, saved at {name}]" for plan in response.plans.keys(): @@ -466,6 +450,12 @@ def detect_dogs(image_path: str): test_multi_plan=test_multi_plan, custom_tool_names=custom_tool_names, ) + + # capture and save any files that were saved in the code to the artifacts + extract_and_save_files_to_artifacts( + artifacts, response["code"] + "\n" + response["test"] + ) + redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code @@ -546,6 +536,11 @@ def detect_dogs(image_path: str): test_multi_plan=False, custom_tool_names=custom_tool_names, ) + # capture and save any files that were saved in the code to the artifacts + extract_and_save_files_to_artifacts( + artifacts, response["code"] + "\n" + response["test"] + ) + redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code @@ -567,49 +562,6 @@ def detect_dogs(image_path: str): return view_lines(code_lines, 0, total_lines, name, total_lines) -def write_media_artifact( - artifacts: Artifacts, - name: str, - media: Union[str, np.ndarray, List[np.ndarray]], - fps: Optional[float] = None, -) -> str: - """Writes a media file to the artifacts object. - - Parameters: - artifacts (Artifacts): The artifacts object to save the media to. - name (str): The name of the media artifact to save. 
- media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either - be a file path, single image or list of frames for a video. - fps (Optional[float]): The frames per second if you are writing a video. - """ - if isinstance(media, str): - with open(media, "rb") as f: - media_bytes = f.read() - elif isinstance(media, list): - media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0) - elif isinstance(media, np.ndarray): - media_bytes = numpy_to_bytes(media) - else: - print(f"[Invalid media type {type(media)}]") - return f"[Invalid media type {type(media)}]" - artifacts[name] = media_bytes - print(f"[Media {name} saved]") - display( - { - MimeType.APPLICATION_ARTIFACT: json.dumps( - { - "name": name, - "action": "create", - "content": base64.b64encode(media_bytes).decode("utf-8"), - "contentType": "media_output", - } - ) - }, - raw=True, - ) - return f"[Media {name} saved]" - - def list_artifacts(artifacts: Artifacts) -> str: """Lists all the artifacts that have been loaded into the artifacts object.""" output_str = artifacts.show() @@ -813,6 +765,61 @@ def use_object_detection_fine_tuning( return diff +def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None: + """Extracts and saves files used in the code to the artifacts object. + + Parameters: + artifacts (Artifacts): The artifacts object to save the files to. + code (str): The code to extract the files from. + """ + try: + response = extract_json( + AnthropicLMM()( # type: ignore + f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. 
Below is the code snippet: + +```python +{code} +``` + +Return the file paths in the following JSON format: +{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}""" + ) + ) + except json.JSONDecodeError: + return + + text_file_ext = [ + ".txt", + ".md", + ".rtf", + ".html", + ".htm", + ".xml", + ".json", + ".csv", + ".tsv", + ".yaml", + ".yml", + ".toml", + ".conf", + ".env", ".ini", + ".log", + ".py", + ".java", + ".js", + ".cpp", + ".c", ".sql", + ".sh", + ] + + if "file_paths" in response and isinstance(response["file_paths"], list): + for file_path in response["file_paths"]: + read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb" + if Path(file_path).is_file(): + with open(file_path, read_mode) as f: + artifacts[Path(file_path).name] = f.read() + + META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -822,7 +829,6 @@ def use_object_detection_fine_tuning( generate_vision_plan, generate_vision_code, edit_vision_code, - write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning,