diff --git a/README.md b/README.md
index e34e265e..3812767a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
-
+
# 🔍🤖 Vision Agent
[![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -345,6 +345,11 @@ result = agent.generate_code(conv)
## Additional Backends
+### E2B Code Execution
+If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
+set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
+run all the agent generated code on the E2B backend.
+
### Anthropic
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
Anthropic API key and set it in your environment variables:
diff --git a/assets/logo.jpg b/assets/logo.jpg
deleted file mode 100644
index a2fa3a2c..00000000
Binary files a/assets/logo.jpg and /dev/null differ
diff --git a/assets/logo.png b/assets/logo.png
new file mode 100644
index 00000000..636574f2
Binary files /dev/null and b/assets/logo.png differ
diff --git a/docs/index.md b/docs/index.md
index 08c808a9..297f70ac 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -341,6 +341,11 @@ result = agent.generate_code(conv)
## Additional Backends
+### E2B Code Execution
+If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
+set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
+run all the agent generated code on the E2B backend.
+
### Anthropic
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
Anthropic API key and set it in your environment variables:
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 690795f0..42c8e62e 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -24,7 +24,6 @@
grounding_dino,
grounding_sam,
ixc25_image_vqa,
- ixc25_temporal_localization,
ixc25_video_vqa,
loca_visual_prompt_counting,
loca_zero_shot_counting,
@@ -71,6 +70,14 @@ def test_owl_v2_image():
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])
+def test_owl_v2_image_empty():
+ result = owl_v2_image(
+ prompt="coin",
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
+
+
def test_owl_v2_fine_tune_id():
img = ski.data.coins()
result = owl_v2_image(
@@ -110,6 +117,14 @@ def test_florence2_phrase_grounding():
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])
+def test_florence2_phrase_grounding_empty():
+ result = florence2_phrase_grounding(
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ prompt="coin",
+ )
+ assert result == []
+
+
def test_florence2_phrase_grounding_fine_tune_id():
img = ski.data.coins()
result = florence2_phrase_grounding(
@@ -147,7 +162,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id():
fine_tune_id=FINE_TUNE_ID,
)
assert len(result) == 10
- assert 16 <= len([res["label"] for res in result[0]]) <= 26
+ assert 12 <= len([res["label"] for res in result[0]]) <= 26
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result[0]])
@@ -195,6 +210,14 @@ def test_florence2_sam2_image_fine_tune_id():
assert len([res["mask"] for res in result]) == len(result)
+def test_florence2_sam2_image_empty():
+ result = florence2_sam2_image(
+ prompt="coin",
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
+
+
def test_florence2_sam2_video():
frames = [
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
@@ -208,7 +231,7 @@ def test_florence2_sam2_video():
assert len([res["mask"] for res in result[0]]) == 25
-def test_segmentation():
+def test_detr_segmentation():
img = ski.data.coins()
result = detr_segmentation(
image=img,
@@ -218,6 +241,13 @@ def test_segmentation():
assert len([res["mask"] for res in result]) == 1
+def test_detr_segmentation_empty():
+ result = detr_segmentation(
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
+
+
def test_clip():
img = ski.data.coins()
result = clip(
@@ -227,6 +257,15 @@ def test_clip():
assert result["scores"] == [0.9999, 0.0001]
+def test_clip_empty():
+ result = clip(
+ classes=["coins", "notes"],
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result["scores"] == []
+ assert result["labels"] == []
+
+
def test_vit_classification():
img = ski.data.coins()
result = vit_image_classification(
@@ -235,6 +274,14 @@ def test_vit_classification():
assert "typewriter keyboard" in result["labels"]
+def test_vit_classification_empty():
+ result = vit_image_classification(
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result["labels"] == []
+ assert result["scores"] == []
+
+
def test_nsfw_classification():
img = ski.data.coins()
result = vit_nsfw_classification(
@@ -243,7 +290,7 @@ def test_nsfw_classification():
assert result["label"] == "normal"
-def test_image_caption() -> None:
+def test_image_caption():
img = ski.data.rocket()
result = blip_image_caption(
image=img,
@@ -251,7 +298,7 @@ def test_image_caption() -> None:
assert result.strip() == "a rocket on a stand"
-def test_florence_image_caption() -> None:
+def test_florence_image_caption():
img = ski.data.rocket()
result = florence2_image_caption(
image=img,
@@ -259,7 +306,7 @@ def test_florence_image_caption() -> None:
assert "The image shows a rocket on a launch pad at night" in result.strip()
-def test_loca_zero_shot_counting() -> None:
+def test_loca_zero_shot_counting():
img = ski.data.coins()
result = loca_zero_shot_counting(
@@ -268,7 +315,7 @@ def test_loca_zero_shot_counting() -> None:
assert result["count"] == 21
-def test_loca_visual_prompt_counting() -> None:
+def test_loca_visual_prompt_counting():
img = ski.data.coins()
result = loca_visual_prompt_counting(
visual_prompt={"bbox": [85, 106, 122, 145]},
@@ -277,7 +324,7 @@ def test_loca_visual_prompt_counting() -> None:
assert result["count"] == 25
-def test_git_vqa_v2() -> None:
+def test_git_vqa_v2():
img = ski.data.rocket()
result = git_vqa_v2(
prompt="Is the scene captured during day or night ?",
@@ -286,7 +333,7 @@ def test_git_vqa_v2() -> None:
assert result.strip() == "night"
-def test_image_qa_with_context() -> None:
+def test_image_qa_with_context():
img = ski.data.rocket()
result = florence2_roberta_vqa(
prompt="Is the scene captured during day or night ?",
@@ -295,7 +342,7 @@ def test_image_qa_with_context() -> None:
assert "night" in result.strip()
-def test_ixc25_image_vqa() -> None:
+def test_ixc25_image_vqa():
img = ski.data.cat()
result = ixc25_image_vqa(
prompt="What animal is in this image?",
@@ -304,7 +351,7 @@ def test_ixc25_image_vqa() -> None:
assert "cat" in result.strip()
-def test_ixc25_video_vqa() -> None:
+def test_ixc25_video_vqa():
frames = [
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
]
@@ -315,18 +362,7 @@ def test_ixc25_video_vqa() -> None:
assert "cat" in result.strip()
-def test_ixc25_temporal_localization() -> None:
- frames = [
- np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
- ]
- result = ixc25_temporal_localization(
- prompt="What animal is in this video?",
- frames=frames,
- )
- assert result == [True] * 10
-
-
-def test_ocr() -> None:
+def test_ocr():
img = ski.data.page()
result = ocr(
image=img,
@@ -334,7 +370,14 @@ def test_ocr() -> None:
assert any("Region-based segmentation" in res["label"] for res in result)
-def test_florence2_ocr() -> None:
+def test_ocr_empty():
+ result = ocr(
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
+
+
+def test_florence2_ocr():
img = ski.data.page()
result = florence2_ocr(
image=img,
@@ -342,6 +385,13 @@ def test_florence2_ocr() -> None:
assert any("Region-based segmentation" in res["label"] for res in result)
+def test_florence2_ocr_empty():
+ result = florence2_ocr(
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
+
+
def test_mask_distance():
# Create two binary masks
mask1 = np.zeros((100, 100), dtype=np.uint8)
@@ -399,14 +449,22 @@ def test_generate_hed():
assert result.shape == img.shape
-def test_countgd_counting() -> None:
+def test_countgd_counting():
img = ski.data.coins()
result = countgd_counting(image=img, prompt="coin")
assert len(result) == 24
assert [res["label"] for res in result] == ["coin"] * 24
-def test_countgd_example_based_counting() -> None:
+def test_countgd_counting_empty():
+ result = countgd_counting(
+ prompt="coin",
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
+
+
+def test_countgd_example_based_counting():
img = ski.data.coins()
result = countgd_example_based_counting(
visual_prompts=[[85, 106, 122, 145]],
@@ -414,3 +472,11 @@ def test_countgd_example_based_counting() -> None:
)
assert len(result) == 24
assert [res["label"] for res in result] == ["object"] * 24
+
+
+def test_countgd_example_based_counting_empty():
+ result = countgd_example_based_counting(
+ visual_prompts=[[85, 106, 122, 145]],
+ image=np.zeros((0, 0, 3)).astype(np.uint8),
+ )
+ assert result == []
diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py
index 3fe619e8..e7a6e7c5 100644
--- a/tests/unit/test_va.py
+++ b/tests/unit/test_va.py
@@ -1,4 +1,12 @@
-from vision_agent.agent.vision_agent import parse_execution
+from vision_agent.agent.agent_utils import extract_tag
+from vision_agent.tools.meta_tools import use_extra_vision_agent_args
+
+
+def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
+ code = extract_tag(code, "execute_python")
+ if not code:
+ return None
+ return use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
def test_parse_execution_zero():
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 42541d33..29643ecd 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,4 +1,5 @@
import copy
+import json
import logging
import os
import tempfile
@@ -6,7 +7,7 @@
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_json
+from vision_agent.agent.agent_utils import extract_json, extract_tag
from vision_agent.agent.vision_agent_prompts import (
EXAMPLES_CODE1,
EXAMPLES_CODE2,
@@ -18,6 +19,7 @@
META_TOOL_DOCSTRING,
Artifacts,
check_and_load_image,
+ extract_and_save_files_to_artifacts,
use_extra_vision_agent_args,
)
from vision_agent.utils import CodeInterpreterFactory
@@ -35,7 +37,7 @@ class BoilerplateCode:
pre_code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
"artifacts = Artifacts('{remote_path}')",
"artifacts.load('{remote_path}')",
]
@@ -57,6 +59,32 @@ def add_boilerplate(code: str, **format: Any) -> str:
)
+def format_agent_message(agent_message: str) -> str:
+ agent_message_json = extract_json(agent_message)
+ output = ""
+ if "thinking" in agent_message_json and agent_message_json["thinking"]:
+ output += "
" + agent_message_json["thinking"] + ""
+ if "response" in agent_message_json and agent_message_json["response"]:
+ output += "
" + agent_message_json["response"] + ""
+ if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
+ output += (
+ "\n
\n"
+ + agent_message_json["execute_python"]
+ + "\n\n"
+ )
+ if (
+ "let_user_respond" in agent_message_json
+ and agent_message_json["let_user_respond"]
+ ):
+ output += (
+ "
"
+ + str(agent_message_json["let_user_respond"])
+ + ""
+ )
+
+ return output
+
+
def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
chat = copy.deepcopy(chat)
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
elif chat_i["role"] == "observation":
conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
elif chat_i["role"] == "assistant":
- conversation += f"AGENT: {chat_i['content']}\n\n"
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore
else:
raise ValueError(f"role {chat_i['role']} is not supported")
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
and len(chat[-1]["media"]) > 0 # type: ignore
):
message["media"] = chat[-1]["media"]
- return extract_json(orch([message], stream=False)) # type: ignore
+ conv_resp = cast(str, orch([message], stream=False))
+
+ let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
+ let_user_respond = (
+ "true" in let_user_respond_str.lower() if let_user_respond_str else False
+ )
+
+ return {
+ "thinking": extract_tag(conv_resp, "thinking"),
+ "response": extract_tag(conv_resp, "response"),
+ "execute_python": extract_tag(conv_resp, "execute_python"),
+ "let_user_respond": let_user_respond,
+ }
def execute_code_action(
- code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+ artifacts: Artifacts,
+ code: str,
+ code_interpreter: CodeInterpreter,
+ artifact_remote_path: str,
) -> Tuple[Execution, str]:
result = code_interpreter.exec_isolation(
BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
)
+ extract_and_save_files_to_artifacts(artifacts, code)
obs = str(result.logs)
if result.error:
@@ -100,33 +144,8 @@ def execute_code_action(
return result, obs
-def parse_execution(
- response: str,
- test_multi_plan: bool = True,
- custom_tool_names: Optional[List[str]] = None,
-) -> Optional[str]:
- code = None
- remaining = response
- all_code = []
- while "
" in remaining:
- code_i = remaining[
- remaining.find("") + len("") :
- ]
- code_i = code_i[: code_i.find("")]
- remaining = remaining[
- remaining.find("") + len("") :
- ]
- all_code.append(code_i)
-
- if len(all_code) > 0:
- code = "\n".join(all_code)
-
- if code is not None:
- code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
- return code
-
-
def execute_user_code_action(
+ artifacts: Artifacts,
last_user_message: Message,
code_interpreter: CodeInterpreter,
artifact_remote_path: str,
@@ -138,40 +157,43 @@ def execute_user_code_action(
return user_result, user_obs
last_user_content = cast(str, last_user_message.get("content", ""))
+ try:
+ user_code_action = json.loads(last_user_content).get("execute_python", None)
+ except json.JSONDecodeError:
+ return user_result, user_obs
- user_code_action = parse_execution(last_user_content, False)
if user_code_action is not None:
+ user_code_action = use_extra_vision_agent_args(user_code_action, False)
user_result, user_obs = execute_code_action(
- user_code_action, code_interpreter, artifact_remote_path
+ artifacts, user_code_action, code_interpreter, artifact_remote_path
)
if user_result.error:
user_obs += f"\n{user_result.error}"
return user_result, user_obs
-def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
+def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
response = copy.deepcopy(response)
- if "response" in response:
- resp_str = response["response"]
- if "
" in resp_str:
- # only include descriptions for these, the rest will just have executing
- # code
- description_map = {
- "open_code_artifact": "Reading file.",
- "create_code_artifact": "Creating file.",
- "edit_code_artifact": "Editing file.",
- "generate_vision_code": "Generating vision code.",
- "edit_vision_code": "Editing vision code.",
- }
- description = ""
- for k, v in description_map.items():
- if k in resp_str:
- description += v + " "
- if description == "":
- description = "Executing code."
- resp_str = resp_str[resp_str.find("") :]
- resp_str = description + resp_str
- response["response"] = resp_str
+
+ if "execute_python" in response and response["execute_python"]:
+ # only include descriptions for these, the rest will just have executing
+ # code
+ description_map = {
+ "open_code_artifact": "Reading file.",
+ "create_code_artifact": "Creating file.",
+ "edit_code_artifact": "Editing file.",
+ "generate_vision_code": "Generating vision code.",
+ "edit_vision_code": "Editing vision code.",
+ }
+ description = ""
+ for k, v in description_map.items():
+ if k in response["execute_python"]:
+ description += v + " "
+ if description == "":
+ description = "Executing code."
+
+ response["response"] = description
+
return response
@@ -369,7 +391,10 @@ def chat_with_artifacts(
self.streaming_message({"role": "observation", "content": artifacts_loaded})
user_result, user_obs = execute_user_code_action(
- last_user_message, code_interpreter, str(remote_artifacts_path)
+ artifacts,
+ last_user_message,
+ code_interpreter,
+ str(remote_artifacts_path),
)
finished = user_result is not None and user_obs is not None
if user_result is not None and user_obs is not None:
@@ -394,13 +419,13 @@ def chat_with_artifacts(
int_chat.append(
{
"role": "assistant",
- "content": str(add_step_descriptions(response)),
+ "content": json.dumps(add_step_descriptions(response)),
}
)
orig_chat.append(
{
"role": "assistant",
- "content": str(add_step_descriptions(response)),
+ "content": json.dumps(add_step_descriptions(response)),
}
)
@@ -408,11 +433,13 @@ def chat_with_artifacts(
if last_response == response:
response["let_user_respond"] = True
- finished = response["let_user_respond"]
+ finished = response.get("let_user_respond", False)
- code_action = parse_execution(
- response["response"], test_multi_plan, custom_tool_names
- )
+ code_action = response.get("execute_python", None)
+ if code_action is not None:
+ code_action = use_extra_vision_agent_args(
+ code_action, test_multi_plan, custom_tool_names
+ )
if last_response == response:
self.streaming_message(
@@ -431,14 +458,17 @@ def chat_with_artifacts(
self.streaming_message(
{
"role": "assistant",
- "content": response,
+ "content": json.dumps(response),
"finished": finished and code_action is None,
}
)
if code_action is not None:
result, obs = execute_code_action(
- code_action, code_interpreter, str(remote_artifacts_path)
+ artifacts,
+ code_action,
+ code_interpreter,
+ str(remote_artifacts_path),
)
media_obs = check_and_load_image(code_action)
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index a8b1e543..278b0b4c 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -27,13 +27,14 @@
**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
-3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set to `true`.
4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
-5. **Output in JSON**: Respond in the following format in JSON:
+5. **Output**: You can only respond with , , , and tags.
-```json
-{{"thoughts": , "response": , "let_user_respond": }}.
-```
+Your thoughts here...
+Your response to the user here...
+Your code here...
+true/false for whether or not to you want to let the user respond.
"""
@@ -45,7 +46,11 @@
Artifact dog.jpg loaded to /path/to/images/dog.jpg
[End of artifacts]
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false}
+AGENT: I will use the generate_vision_code to detect the dogs in the image.
+
+generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
+
+false
OBSERVATION:
[Artifact dog_detector.py]
@@ -56,14 +61,21 @@
4| return dogs
[End of artifact]
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
+AGENT: I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.
+
+from dog_detector import detect_dogs
+print(detect_dogs('/example/workspace/dog.jpg'))
+
+false
OBSERVATION:
----- stdout -----
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
+AGENT: Two dogs are detected, I will show this to the user and ask them if the result looks good.
+The code detectd two dogs, do the results look good to you?
+true
"""
EXAMPLES_CODE1_EXTRA = """
@@ -75,7 +87,11 @@
Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
[End of artifacts]
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])", "let_user_respond": false}
+AGENT: I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that
+
+edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
+
+false
OBSERVATION:
[Artifact dog_detector.py]
@@ -86,13 +102,20 @@
4| return dogs
[End of artifact]
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false}
+AGENT: I have edited the code to detect only one dog, I must now run the code and print the results to get the output.
+
+from dog_detector import detect_dogs
+print(detect_dogs('/path/to/images/dog.jpg'))
+
+false
OBSERVATION:
----- stdout -----
[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
+AGENT: One dog is detected, I will show this to the user and ask them if the result looks good.
+The code detected one dog, do these results look good to you?
+true
"""
EXAMPLES_CODE2 = """
@@ -103,12 +126,18 @@
Artifact image.jpg loaded to /path/to/images/image.jpg
[End of artifacts]
-AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifacts('image.jpg')", "let_user_respond": false}
+AGENT: The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.
+
+view_media_artifacts('image.jpg')
+
+false
OBSERVATION:
[Image image.jpg displayed]
-AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
+AGENT: The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.
+The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?
+true
"""
@@ -119,7 +148,9 @@
[Artifacts loaded]
[End of artifacts]
-AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+AGENT: The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.
+Can you provide an image of workers with helmets?
+true
USER: Yes you can use workers.png
@@ -128,7 +159,11 @@
Artifact workers.png loaded to /path/to/images/workers.png
[End of artifacts]
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])", "let_user_respond": false}
+AGENT: I will use the generate_vision_code to count the workers with helmets in the image.
+
+generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
+
+false
OBSERVATION:
[Artifact code.py]
@@ -152,22 +187,37 @@
16| return count
[End of artifact]
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false}
+AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.
+
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+
+false
OBSERVATION:
----- stdout -----
2
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.
+The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.
+true
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false}
+AGENT: Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model
+
+object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
+
+false
OBSERVATION:
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false}
+AGENT: The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.
+
+use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
+
+false
OBSERVATION:
[Artifact code.py edits]
@@ -184,11 +234,18 @@ def count_workers_with_helmets(image_path: str, output_path: str):
count = 0
[End of edits]
-AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false}
+AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output.
+
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+
+false
OBSERVATION:
----- stdout -----
3
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.
+I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.
+true
"""
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 2a75aa2b..16df1193 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -37,7 +37,6 @@
grounding_dino,
grounding_sam,
ixc25_image_vqa,
- ixc25_temporal_localization,
ixc25_video_vqa,
load_image,
loca_visual_prompt_counting,
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7f59c685..d9537e7c 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,4 +1,3 @@
-import base64
import difflib
import json
import os
@@ -9,7 +8,6 @@
from pathlib import Path
from typing import Any, Dict, List, Optional, Union
-import numpy as np
from IPython.display import display
from redbaron import RedBaron # type: ignore
@@ -22,8 +20,7 @@
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
-from vision_agent.utils.video import frames_to_bytes
+from vision_agent.utils.image_utils import convert_to_b64
CURRENT_FILE = None
CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
redisplay_results(response.test_results)
response.test_results = None
artifacts[name] = response.model_dump_json()
- media_names = extract_json(
- AnthropicLMM()( # type: ignore
- f"""Extract any media file names from this output in the following JSON format:
-{{"media": ["image1.jpg", "image2.jpg"]}}
-
-{artifacts[name]}"""
- )
- )
- if "media" in media_names and isinstance(media_names, dict):
- for media in media_names["media"]:
- if isinstance(media, str):
- with open(media, "rb") as f:
- artifacts[media] = f.read()
output_str = f"[Start Plan Context, saved at {name}]"
for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def detect_dogs(image_path: str):
test_multi_plan=test_multi_plan,
custom_tool_names=custom_tool_names,
)
+
+ # capture and save any files that were saved in the code to the artifacts
+ extract_and_save_files_to_artifacts(
+ artifacts, response["code"] + "\n" + response["test"]
+ )
+
redisplay_results(response["test_result"])
code = response["code"]
artifacts[name] = code
@@ -546,6 +536,11 @@ def detect_dogs(image_path: str):
test_multi_plan=False,
custom_tool_names=custom_tool_names,
)
+ # capture and save any files that were saved in the code to the artifacts
+ extract_and_save_files_to_artifacts(
+ artifacts, response["code"] + "\n" + response["test"]
+ )
+
redisplay_results(response["test_result"])
code = response["code"]
artifacts[name] = code
@@ -567,49 +562,6 @@ def detect_dogs(image_path: str):
return view_lines(code_lines, 0, total_lines, name, total_lines)
-def write_media_artifact(
- artifacts: Artifacts,
- name: str,
- media: Union[str, np.ndarray, List[np.ndarray]],
- fps: Optional[float] = None,
-) -> str:
- """Writes a media file to the artifacts object.
-
- Parameters:
- artifacts (Artifacts): The artifacts object to save the media to.
- name (str): The name of the media artifact to save.
- media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
- be a file path, single image or list of frames for a video.
- fps (Optional[float]): The frames per second if you are writing a video.
- """
- if isinstance(media, str):
- with open(media, "rb") as f:
- media_bytes = f.read()
- elif isinstance(media, list):
- media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
- elif isinstance(media, np.ndarray):
- media_bytes = numpy_to_bytes(media)
- else:
- print(f"[Invalid media type {type(media)}]")
- return f"[Invalid media type {type(media)}]"
- artifacts[name] = media_bytes
- print(f"[Media {name} saved]")
- display(
- {
- MimeType.APPLICATION_ARTIFACT: json.dumps(
- {
- "name": name,
- "action": "create",
- "content": base64.b64encode(media_bytes).decode("utf-8"),
- "contentType": "media_output",
- }
- )
- },
- raw=True,
- )
- return f"[Media {name} saved]"
-
-
def list_artifacts(artifacts: Artifacts) -> str:
"""Lists all the artifacts that have been loaded into the artifacts object."""
output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
return diff
+def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
+ """Extracts and saves files used in the code to the artifacts object.
+
+ Parameters:
+ artifacts (Artifacts): The artifacts object to save the files to.
+ code (str): The code to extract the files from.
+ """
+ try:
+ response = extract_json(
+ AnthropicLMM()( # type: ignore
+ f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:
+
+```python
+{code}
+```
+
+Return the file paths in the following JSON format:
+{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
+ )
+ )
+ except json.JSONDecodeError:
+ return
+
+ text_file_ext = [
+ ".txt",
+ ".md",
+        ".rtf",
+ ".html",
+ ".htm",
+        ".xml",
+ ".json",
+ ".csv",
+ ".tsv",
+ ".yaml",
+ ".yml",
+ ".toml",
+ ".conf",
+        ".env", ".ini",
+ ".log",
+ ".py",
+ ".java",
+ ".js",
+ ".cpp",
+        ".c", ".sql",
+ ".sh",
+ ]
+
+    if isinstance(response, dict) and isinstance(response.get("file_paths"), list):
+ for file_path in response["file_paths"]:
+ read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb"
+ if Path(file_path).is_file():
+ with open(file_path, read_mode) as f:
+ artifacts[Path(file_path).name] = f.read()
+
+
META_TOOL_DOCSTRING = get_tool_documentation(
[
get_tool_descriptions,
@@ -822,7 +829,6 @@ def use_object_detection_fine_tuning(
generate_vision_plan,
generate_vision_code,
edit_vision_code,
- write_media_artifact,
view_media_artifact,
object_detection_fine_tuning,
use_object_detection_fine_tuning,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 45f10c33..86e484b2 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -181,6 +181,8 @@ def owl_v2_image(
"""
image_size = image.shape[:2]
+ if image_size[0] < 1 or image_size[1] < 1:
+ return []
if fine_tune_id is not None:
image_b64 = convert_to_b64(image)
@@ -413,6 +415,9 @@ def florence2_sam2_image(
},
]
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ return []
+
if fine_tune_id is not None:
image_b64 = convert_to_b64(image)
landing_api = LandingPublicAPI()
@@ -701,6 +706,8 @@ def countgd_counting(
]
"""
image_size = image.shape[:2]
+ if image_size[0] < 1 or image_size[1] < 1:
+ return []
buffer_bytes = numpy_to_bytes(image)
files = [("image", buffer_bytes)]
prompt = prompt.replace(", ", " .")
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
]
"""
image_size = image.shape[:2]
+ if image_size[0] < 1 or image_size[1] < 1:
+ return []
buffer_bytes = numpy_to_bytes(image)
files = [("image", buffer_bytes)]
visual_prompts = [
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
>>> ixc25_image_vqa('What is the cat doing?', image)
'drinking milk'
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
buffer_bytes = numpy_to_bytes(image)
files = [("image", buffer_bytes)]
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
return cast(str, data["answer"])
-def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
- """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
- given a prompt that can be other an object or a phrase. It returns a list of
- boolean values indicating whether the object or phrase is present in the
- corresponding frame.
-
- Parameters:
- prompt (str): The question about the video
- frames (List[np.ndarray]): The reference frames used for the question
-
- Returns:
- List[bool]: A list of boolean values indicating whether the object or phrase is
- present in the corresponding frame.
-
- Example
- -------
- >>> output = ixc25_temporal_localization('soccer goal', frames)
- >>> print(output)
- [False, False, False, True, True, True, False, False, False, False]
- >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
- """
-
- buffer_bytes = frames_to_bytes(frames)
- files = [("video", buffer_bytes)]
- payload = {
- "prompt": prompt,
- "chunk_length": 2,
- "function_name": "ixc25_temporal_localization",
- }
- data: List[int] = send_inference_request(
- payload,
- "video-temporal-localization?model=internlm-xcomposer",
- files=files,
- v2=True,
- )
- chunk_size = round(len(frames) / len(data))
- data_explode = [[elt] * chunk_size for elt in data]
- data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
- return data_bool[: len(frames)]
-
-
def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
"""'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
including regular images or images of documents or presentations. It returns text
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
{"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ return {"labels": [], "scores": []}
+
image_b64 = convert_to_b64(image)
data = {
"prompt": ",".join(classes),
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
>>> vit_image_classification(image)
{"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ return {"labels": [], "scores": []}
image_b64 = convert_to_b64(image)
data = {
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
>>> vit_nsfw_classification(image)
{"label": "normal", "scores": 0.68},
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
image_b64 = convert_to_b64(image)
data = {
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
]
"""
image_size = image.shape[:2]
+ if image_size[0] < 1 or image_size[1] < 1:
+ return []
image_b64 = convert_to_b64(image)
if fine_tune_id is not None:
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
},
]
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ return []
image_b64 = convert_to_b64(image)
data = {
"image": image_b64,
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
[10, 11, 15, ..., 202, 202, 205],
[10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
"""
+ if image.shape[0] < 1 or image.shape[1] < 1:
+ raise ValueError(f"Image is empty, image shape: {image.shape}")
+
image_b64 = convert_to_b64(image)
data = {
"image": image_b64,