diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 24cb3851..29643ecd 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -19,6 +19,7 @@
META_TOOL_DOCSTRING,
Artifacts,
check_and_load_image,
+ extract_and_save_files_to_artifacts,
use_extra_vision_agent_args,
)
from vision_agent.utils import CodeInterpreterFactory
@@ -36,7 +37,7 @@ class BoilerplateCode:
pre_code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
"artifacts = Artifacts('{remote_path}')",
"artifacts.load('{remote_path}')",
]
@@ -94,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
elif chat_i["role"] == "observation":
conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
elif chat_i["role"] == "assistant":
- conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n"
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore
else:
raise ValueError(f"role {chat_i['role']} is not supported")
@@ -127,11 +128,15 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
def execute_code_action(
- code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+ artifacts: Artifacts,
+ code: str,
+ code_interpreter: CodeInterpreter,
+ artifact_remote_path: str,
) -> Tuple[Execution, str]:
result = code_interpreter.exec_isolation(
BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
)
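+    # capture any files saved by the executed code and add them to the artifacts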
+ extract_and_save_files_to_artifacts(artifacts, code)
obs = str(result.logs)
if result.error:
@@ -140,6 +145,7 @@ def execute_code_action(
def execute_user_code_action(
+ artifacts: Artifacts,
last_user_message: Message,
code_interpreter: CodeInterpreter,
artifact_remote_path: str,
@@ -159,7 +165,7 @@ def execute_user_code_action(
if user_code_action is not None:
user_code_action = use_extra_vision_agent_args(user_code_action, False)
user_result, user_obs = execute_code_action(
- user_code_action, code_interpreter, artifact_remote_path
+ artifacts, user_code_action, code_interpreter, artifact_remote_path
)
if user_result.error:
user_obs += f"\n{user_result.error}"
@@ -385,7 +391,10 @@ def chat_with_artifacts(
self.streaming_message({"role": "observation", "content": artifacts_loaded})
user_result, user_obs = execute_user_code_action(
- last_user_message, code_interpreter, str(remote_artifacts_path)
+ artifacts,
+ last_user_message,
+ code_interpreter,
+ str(remote_artifacts_path),
)
finished = user_result is not None and user_obs is not None
if user_result is not None and user_obs is not None:
@@ -456,7 +465,10 @@ def chat_with_artifacts(
if code_action is not None:
result, obs = execute_code_action(
- code_action, code_interpreter, str(remote_artifacts_path)
+ artifacts,
+ code_action,
+ code_interpreter,
+ str(remote_artifacts_path),
)
media_obs = check_and_load_image(code_action)
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 93343045..278b0b4c 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -187,10 +187,10 @@
16| return count
[End of artifact]
-AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.
+AGENT: I have generated the code to count the workers with helmets in the image. I must now run the code and print the output.
from code import count_workers_with_helmets
-print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
false
@@ -234,11 +234,10 @@ def count_workers_with_helmets(image_path: str, output_path: str):
count = 0
[End of edits]
-AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.
+AGENT: I have updated the code to use the fine-tuned model. I must now run the code and print the output.
from code import count_workers_with_helmets
print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
-write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')
false
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7f59c685..d9537e7c 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,4 +1,3 @@
-import base64
import difflib
import json
import os
@@ -9,7 +8,6 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
-import numpy as np
from IPython.display import display
from redbaron import RedBaron # type: ignore
@@ -22,8 +20,7 @@
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
-from vision_agent.utils.video import frames_to_bytes
+from vision_agent.utils.image_utils import convert_to_b64
CURRENT_FILE = None
CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
redisplay_results(response.test_results)
response.test_results = None
artifacts[name] = response.model_dump_json()
- media_names = extract_json(
- AnthropicLMM()( # type: ignore
- f"""Extract any media file names from this output in the following JSON format:
-{{"media": ["image1.jpg", "image2.jpg"]}}
-
-{artifacts[name]}"""
- )
- )
- if "media" in media_names and isinstance(media_names, dict):
- for media in media_names["media"]:
- if isinstance(media, str):
- with open(media, "rb") as f:
- artifacts[media] = f.read()
output_str = f"[Start Plan Context, saved at {name}]"
for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def detect_dogs(image_path: str):
test_multi_plan=test_multi_plan,
custom_tool_names=custom_tool_names,
)
+
+    # capture any files saved by the generated code and add them to the artifacts
+ extract_and_save_files_to_artifacts(
+ artifacts, response["code"] + "\n" + response["test"]
+ )
+
redisplay_results(response["test_result"])
code = response["code"]
artifacts[name] = code
@@ -546,6 +536,11 @@ def detect_dogs(image_path: str):
test_multi_plan=False,
custom_tool_names=custom_tool_names,
)
+    # capture any files saved by the generated code and add them to the artifacts
+ extract_and_save_files_to_artifacts(
+ artifacts, response["code"] + "\n" + response["test"]
+ )
+
redisplay_results(response["test_result"])
code = response["code"]
artifacts[name] = code
@@ -567,49 +562,6 @@ def detect_dogs(image_path: str):
return view_lines(code_lines, 0, total_lines, name, total_lines)
-def write_media_artifact(
- artifacts: Artifacts,
- name: str,
- media: Union[str, np.ndarray, List[np.ndarray]],
- fps: Optional[float] = None,
-) -> str:
- """Writes a media file to the artifacts object.
-
- Parameters:
- artifacts (Artifacts): The artifacts object to save the media to.
- name (str): The name of the media artifact to save.
- media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
- be a file path, single image or list of frames for a video.
- fps (Optional[float]): The frames per second if you are writing a video.
- """
- if isinstance(media, str):
- with open(media, "rb") as f:
- media_bytes = f.read()
- elif isinstance(media, list):
- media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
- elif isinstance(media, np.ndarray):
- media_bytes = numpy_to_bytes(media)
- else:
- print(f"[Invalid media type {type(media)}]")
- return f"[Invalid media type {type(media)}]"
- artifacts[name] = media_bytes
- print(f"[Media {name} saved]")
- display(
- {
- MimeType.APPLICATION_ARTIFACT: json.dumps(
- {
- "name": name,
- "action": "create",
- "content": base64.b64encode(media_bytes).decode("utf-8"),
- "contentType": "media_output",
- }
- )
- },
- raw=True,
- )
- return f"[Media {name} saved]"
-
-
def list_artifacts(artifacts: Artifacts) -> str:
"""Lists all the artifacts that have been loaded into the artifacts object."""
output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
return diff
+def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
+ """Extracts and saves files used in the code to the artifacts object.
+
+ Parameters:
+ artifacts (Artifacts): The artifacts object to save the files to.
+ code (str): The code to extract the files from.
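+
+    Example
+    -------
+        >>> # e.g. picks up 'output.jpg' written by the snippet below
+        >>> extract_and_save_files_to_artifacts(
+        ...     artifacts, "cv2.imwrite('output.jpg', image)"
+        ... )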
+ """
+ try:
+ response = extract_json(
+ AnthropicLMM()( # type: ignore
+ f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:
+
+```python
+{code}
+```
+
+Return the file paths in the following JSON format:
+{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
+ )
+ )
+ except json.JSONDecodeError:
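+        # the LMM response could not be parsed as JSON; nothing to extract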
+ return
+
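+    # extensions read as text ("r"); all other files are read as raw bytes ("rb")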
+ text_file_ext = [
+ ".txt",
+ ".md",
+ "rtf",
+ ".html",
+ ".htm",
+ "xml",
+ ".json",
+ ".csv",
+ ".tsv",
+ ".yaml",
+ ".yml",
+ ".toml",
+ ".conf",
+ ".env" ".ini",
+ ".log",
+ ".py",
+ ".java",
+ ".js",
+ ".cpp",
+ ".c" ".sql",
+ ".sh",
+ ]
+
+ if "file_paths" in response and isinstance(response["file_paths"], list):
+ for file_path in response["file_paths"]:
+ read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
+ if Path(file_path).is_file():
+ with open(file_path, read_mode) as f:
+ artifacts[Path(file_path).name] = f.read()
+
+
META_TOOL_DOCSTRING = get_tool_documentation(
[
get_tool_descriptions,
@@ -822,7 +829,6 @@ def use_object_detection_fine_tuning(
generate_vision_plan,
generate_vision_code,
edit_vision_code,
- write_media_artifact,
view_media_artifact,
object_detection_fine_tuning,
use_object_detection_fine_tuning,