diff --git a/README.md b/README.md index e34e265e..3812767a 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
- vision_agent + vision_agent # 🔍🤖 Vision Agent [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew) @@ -345,6 +345,11 @@ result = agent.generate_code(conv) ## Additional Backends +### E2B Code Execution +If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY` +set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will +run all the agent generated code on the E2B backend. + ### Anthropic `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an Anthropic API key and set it in your environment variables: diff --git a/assets/logo.jpg b/assets/logo.jpg deleted file mode 100644 index a2fa3a2c..00000000 Binary files a/assets/logo.jpg and /dev/null differ diff --git a/assets/logo.png b/assets/logo.png new file mode 100644 index 00000000..636574f2 Binary files /dev/null and b/assets/logo.png differ diff --git a/docs/index.md b/docs/index.md index 08c808a9..297f70ac 100644 --- a/docs/index.md +++ b/docs/index.md @@ -341,6 +341,11 @@ result = agent.generate_code(conv) ## Additional Backends +### E2B Code Execution +If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY` +set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will +run all the agent generated code on the E2B backend. + ### Anthropic `AnthropicVisionAgentCoder` uses Anthropic. 
To get started you just need to get an Anthropic API key and set it in your environment variables: diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 690795f0..42c8e62e 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -24,7 +24,6 @@ grounding_dino, grounding_sam, ixc25_image_vqa, - ixc25_temporal_localization, ixc25_video_vqa, loca_visual_prompt_counting, loca_zero_shot_counting, @@ -71,6 +70,14 @@ def test_owl_v2_image(): assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result]) +def test_owl_v2_image_empty(): + result = owl_v2_image( + prompt="coin", + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == [] + + def test_owl_v2_fine_tune_id(): img = ski.data.coins() result = owl_v2_image( @@ -110,6 +117,14 @@ def test_florence2_phrase_grounding(): assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result]) +def test_florence2_phrase_grounding_empty(): + result = florence2_phrase_grounding( + image=np.zeros((0, 0, 3)).astype(np.uint8), + prompt="coin", + ) + assert result == [] + + def test_florence2_phrase_grounding_fine_tune_id(): img = ski.data.coins() result = florence2_phrase_grounding( @@ -147,7 +162,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id(): fine_tune_id=FINE_TUNE_ID, ) assert len(result) == 10 - assert 16 <= len([res["label"] for res in result[0]]) <= 26 + assert 12 <= len([res["label"] for res in result[0]]) <= 26 assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result[0]]) @@ -195,6 +210,14 @@ def test_florence2_sam2_image_fine_tune_id(): assert len([res["mask"] for res in result]) == len(result) +def test_florence2_sam2_image_empty(): + result = florence2_sam2_image( + prompt="coin", + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == [] + + def test_florence2_sam2_video(): frames = [ np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10) @@ -208,7 +231,7 @@ def test_florence2_sam2_video(): 
assert len([res["mask"] for res in result[0]]) == 25 -def test_segmentation(): +def test_detr_segmentation(): img = ski.data.coins() result = detr_segmentation( image=img, @@ -218,6 +241,13 @@ def test_segmentation(): assert len([res["mask"] for res in result]) == 1 +def test_detr_segmentation_empty(): + result = detr_segmentation( + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == [] + + def test_clip(): img = ski.data.coins() result = clip( @@ -227,6 +257,15 @@ def test_clip(): assert result["scores"] == [0.9999, 0.0001] +def test_clip_empty(): + result = clip( + classes=["coins", "notes"], + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result["scores"] == [] + assert result["labels"] == [] + + def test_vit_classification(): img = ski.data.coins() result = vit_image_classification( @@ -235,6 +274,14 @@ def test_vit_classification(): assert "typewriter keyboard" in result["labels"] +def test_vit_classification_empty(): + result = vit_image_classification( + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result["labels"] == [] + assert result["scores"] == [] + + def test_nsfw_classification(): img = ski.data.coins() result = vit_nsfw_classification( @@ -243,7 +290,7 @@ def test_nsfw_classification(): assert result["label"] == "normal" -def test_image_caption() -> None: +def test_image_caption(): img = ski.data.rocket() result = blip_image_caption( image=img, @@ -251,7 +298,7 @@ def test_image_caption() -> None: assert result.strip() == "a rocket on a stand" -def test_florence_image_caption() -> None: +def test_florence_image_caption(): img = ski.data.rocket() result = florence2_image_caption( image=img, @@ -259,7 +306,7 @@ def test_florence_image_caption() -> None: assert "The image shows a rocket on a launch pad at night" in result.strip() -def test_loca_zero_shot_counting() -> None: +def test_loca_zero_shot_counting(): img = ski.data.coins() result = loca_zero_shot_counting( @@ -268,7 +315,7 @@ def 
test_loca_zero_shot_counting() -> None: assert result["count"] == 21 -def test_loca_visual_prompt_counting() -> None: +def test_loca_visual_prompt_counting(): img = ski.data.coins() result = loca_visual_prompt_counting( visual_prompt={"bbox": [85, 106, 122, 145]}, @@ -277,7 +324,7 @@ def test_loca_visual_prompt_counting() -> None: assert result["count"] == 25 -def test_git_vqa_v2() -> None: +def test_git_vqa_v2(): img = ski.data.rocket() result = git_vqa_v2( prompt="Is the scene captured during day or night ?", @@ -286,7 +333,7 @@ def test_git_vqa_v2() -> None: assert result.strip() == "night" -def test_image_qa_with_context() -> None: +def test_image_qa_with_context(): img = ski.data.rocket() result = florence2_roberta_vqa( prompt="Is the scene captured during day or night ?", @@ -295,7 +342,7 @@ def test_image_qa_with_context() -> None: assert "night" in result.strip() -def test_ixc25_image_vqa() -> None: +def test_ixc25_image_vqa(): img = ski.data.cat() result = ixc25_image_vqa( prompt="What animal is in this image?", @@ -304,7 +351,7 @@ def test_ixc25_image_vqa() -> None: assert "cat" in result.strip() -def test_ixc25_video_vqa() -> None: +def test_ixc25_video_vqa(): frames = [ np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10) ] @@ -315,18 +362,7 @@ def test_ixc25_video_vqa() -> None: assert "cat" in result.strip() -def test_ixc25_temporal_localization() -> None: - frames = [ - np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10) - ] - result = ixc25_temporal_localization( - prompt="What animal is in this video?", - frames=frames, - ) - assert result == [True] * 10 - - -def test_ocr() -> None: +def test_ocr(): img = ski.data.page() result = ocr( image=img, @@ -334,7 +370,14 @@ def test_ocr() -> None: assert any("Region-based segmentation" in res["label"] for res in result) -def test_florence2_ocr() -> None: +def test_ocr_empty(): + result = ocr( + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == 
[] + + +def test_florence2_ocr(): img = ski.data.page() result = florence2_ocr( image=img, @@ -342,6 +385,13 @@ def test_florence2_ocr() -> None: assert any("Region-based segmentation" in res["label"] for res in result) +def test_florence2_ocr_empty(): + result = florence2_ocr( + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == [] + + def test_mask_distance(): # Create two binary masks mask1 = np.zeros((100, 100), dtype=np.uint8) @@ -399,14 +449,22 @@ def test_generate_hed(): assert result.shape == img.shape -def test_countgd_counting() -> None: +def test_countgd_counting(): img = ski.data.coins() result = countgd_counting(image=img, prompt="coin") assert len(result) == 24 assert [res["label"] for res in result] == ["coin"] * 24 -def test_countgd_example_based_counting() -> None: +def test_countgd_counting_empty(): + result = countgd_counting( + prompt="coin", + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == [] + + +def test_countgd_example_based_counting(): img = ski.data.coins() result = countgd_example_based_counting( visual_prompts=[[85, 106, 122, 145]], @@ -414,3 +472,11 @@ def test_countgd_example_based_counting() -> None: ) assert len(result) == 24 assert [res["label"] for res in result] == ["object"] * 24 + + +def test_countgd_example_based_counting_empty(): + result = countgd_example_based_counting( + visual_prompts=[[85, 106, 122, 145]], + image=np.zeros((0, 0, 3)).astype(np.uint8), + ) + assert result == [] diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py index 3fe619e8..e7a6e7c5 100644 --- a/tests/unit/test_va.py +++ b/tests/unit/test_va.py @@ -1,4 +1,12 @@ -from vision_agent.agent.vision_agent import parse_execution +from vision_agent.agent.agent_utils import extract_tag +from vision_agent.tools.meta_tools import use_extra_vision_agent_args + + +def parse_execution(code, test_multi_plan=True, custom_tool_names=None): + code = extract_tag(code, "execute_python") + if not code: + return None + return 
use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names) def test_parse_execution_zero(): diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 42541d33..29643ecd 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,4 +1,5 @@ import copy +import json import logging import os import tempfile @@ -6,7 +7,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast from vision_agent.agent import Agent -from vision_agent.agent.agent_utils import extract_json +from vision_agent.agent.agent_utils import extract_json, extract_tag from vision_agent.agent.vision_agent_prompts import ( EXAMPLES_CODE1, EXAMPLES_CODE2, @@ -18,6 +19,7 @@ META_TOOL_DOCSTRING, Artifacts, check_and_load_image, + extract_and_save_files_to_artifacts, use_extra_vision_agent_args, ) from vision_agent.utils import CodeInterpreterFactory @@ -35,7 +37,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -57,6 +59,32 @@ def add_boilerplate(code: str, **format: Any) -> str: ) +def format_agent_message(agent_message: str) -> str: + agent_message_json = extract_json(agent_message) + output = "" + if "thinking" in agent_message_json and agent_message_json["thinking"]: + output += "" + 
agent_message_json["thinking"] + "" + if "response" in agent_message_json and agent_message_json["response"]: + output += "" + agent_message_json["response"] + "" + if "execute_python" in agent_message_json and agent_message_json["execute_python"]: + output += ( + "\n\n" + + agent_message_json["execute_python"] + + "\n\n" + ) + if ( + "let_user_respond" in agent_message_json + and agent_message_json["let_user_respond"] + ): + output += ( + "" + + str(agent_message_json["let_user_respond"]) + + "" + ) + + return output + + def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: chat = copy.deepcopy(chat) @@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: elif chat_i["role"] == "observation": conversation += f"OBSERVATION:\n{chat_i['content']}\n\n" elif chat_i["role"] == "assistant": - conversation += f"AGENT: {chat_i['content']}\n\n" + conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n" # type: ignore else: raise ValueError(f"role {chat_i['role']} is not supported") @@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: and len(chat[-1]["media"]) > 0 # type: ignore ): message["media"] = chat[-1]["media"] - return extract_json(orch([message], stream=False)) # type: ignore + conv_resp = cast(str, orch([message], stream=False)) + + let_user_respond_str = extract_tag(conv_resp, "let_user_respond") + let_user_respond = ( + "true" in let_user_respond_str.lower() if let_user_respond_str else False + ) + + return { + "thinking": extract_tag(conv_resp, "thinking"), + "response": extract_tag(conv_resp, "response"), + "execute_python": extract_tag(conv_resp, "execute_python"), + "let_user_respond": let_user_respond, + } def execute_code_action( - code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str + artifacts: Artifacts, + code: str, + code_interpreter: CodeInterpreter, + artifact_remote_path: str, ) -> Tuple[Execution, str]: result = 
code_interpreter.exec_isolation( BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) ) + extract_and_save_files_to_artifacts(artifacts, code) obs = str(result.logs) if result.error: @@ -100,33 +144,8 @@ def execute_code_action( return result, obs -def parse_execution( - response: str, - test_multi_plan: bool = True, - custom_tool_names: Optional[List[str]] = None, -) -> Optional[str]: - code = None - remaining = response - all_code = [] - while "" in remaining: - code_i = remaining[ - remaining.find("") + len("") : - ] - code_i = code_i[: code_i.find("")] - remaining = remaining[ - remaining.find("") + len("") : - ] - all_code.append(code_i) - - if len(all_code) > 0: - code = "\n".join(all_code) - - if code is not None: - code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names) - return code - - def execute_user_code_action( + artifacts: Artifacts, last_user_message: Message, code_interpreter: CodeInterpreter, artifact_remote_path: str, @@ -138,40 +157,43 @@ def execute_user_code_action( return user_result, user_obs last_user_content = cast(str, last_user_message.get("content", "")) + try: + user_code_action = json.loads(last_user_content).get("execute_python", None) + except json.JSONDecodeError: + return user_result, user_obs - user_code_action = parse_execution(last_user_content, False) if user_code_action is not None: + user_code_action = use_extra_vision_agent_args(user_code_action, False) user_result, user_obs = execute_code_action( - user_code_action, code_interpreter, artifact_remote_path + artifacts, user_code_action, code_interpreter, artifact_remote_path ) if user_result.error: user_obs += f"\n{user_result.error}" return user_result, user_obs -def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]: +def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]: response = copy.deepcopy(response) - if "response" in response: - resp_str = response["response"] - if "" in resp_str: - # only 
include descriptions for these, the rest will just have executing - # code - description_map = { - "open_code_artifact": "Reading file.", - "create_code_artifact": "Creating file.", - "edit_code_artifact": "Editing file.", - "generate_vision_code": "Generating vision code.", - "edit_vision_code": "Editing vision code.", - } - description = "" - for k, v in description_map.items(): - if k in resp_str: - description += v + " " - if description == "": - description = "Executing code." - resp_str = resp_str[resp_str.find("") :] - resp_str = description + resp_str - response["response"] = resp_str + + if "execute_python" in response and response["execute_python"]: + # only include descriptions for these, the rest will just have executing + # code + description_map = { + "open_code_artifact": "Reading file.", + "create_code_artifact": "Creating file.", + "edit_code_artifact": "Editing file.", + "generate_vision_code": "Generating vision code.", + "edit_vision_code": "Editing vision code.", + } + description = "" + for k, v in description_map.items(): + if k in response["execute_python"]: + description += v + " " + if description == "": + description = "Executing code." 
+ + response["response"] = description + return response @@ -369,7 +391,10 @@ def chat_with_artifacts( self.streaming_message({"role": "observation", "content": artifacts_loaded}) user_result, user_obs = execute_user_code_action( - last_user_message, code_interpreter, str(remote_artifacts_path) + artifacts, + last_user_message, + code_interpreter, + str(remote_artifacts_path), ) finished = user_result is not None and user_obs is not None if user_result is not None and user_obs is not None: @@ -394,13 +419,13 @@ def chat_with_artifacts( int_chat.append( { "role": "assistant", - "content": str(add_step_descriptions(response)), + "content": json.dumps(add_step_descriptions(response)), } ) orig_chat.append( { "role": "assistant", - "content": str(add_step_descriptions(response)), + "content": json.dumps(add_step_descriptions(response)), } ) @@ -408,11 +433,13 @@ def chat_with_artifacts( if last_response == response: response["let_user_respond"] = True - finished = response["let_user_respond"] + finished = response.get("let_user_respond", False) - code_action = parse_execution( - response["response"], test_multi_plan, custom_tool_names - ) + code_action = response.get("execute_python", None) + if code_action is not None: + code_action = use_extra_vision_agent_args( + code_action, test_multi_plan, custom_tool_names + ) if last_response == response: self.streaming_message( @@ -431,14 +458,17 @@ def chat_with_artifacts( self.streaming_message( { "role": "assistant", - "content": response, + "content": json.dumps(response), "finished": finished and code_action is None, } ) if code_action is not None: result, obs = execute_code_action( - code_action, code_interpreter, str(remote_artifacts_path) + artifacts, + code_action, + code_interpreter, + str(remote_artifacts_path), ) media_obs = check_and_load_image(code_action) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index a8b1e543..278b0b4c 100644 --- 
a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -27,13 +27,14 @@ **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. 2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. -3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`. +3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set to `true`. 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. -5. **Output in JSON**: Respond in the following format in JSON: +5. **Output**: You can only respond with , , , and tags. -```json -{{"thoughts": , "response": , "let_user_respond": }}. -``` +Your thoughts here... +Your response to the user here... +Your code here... +true/false for whether or not to you want to let the user respond. """ @@ -45,7 +46,11 @@ Artifact dog.jpg loaded to /path/to/images/dog.jpg [End of artifacts] -AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false} +AGENT: I will use the generate_vision_code to detect the dogs in the image. 
+ +generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg']) + +false OBSERVATION: [Artifact dog_detector.py] @@ -56,14 +61,21 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} +AGENT: I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output. + +from dog_detector import detect_dogs +print(detect_dogs('/example/workspace/dog.jpg')) + +false OBSERVATION: ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}] -AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true} +AGENT: Two dogs are detected, I will show this to the user and ask them if the result looks good. +The code detectd two dogs, do the results look good to you? +true """ EXAMPLES_CODE1_EXTRA = """ @@ -75,7 +87,11 @@ Artifact dog_detector.py loaded to /path/to/code/dog_detector.py [End of artifacts] -AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])", "let_user_respond": false} +AGENT: I will use edit_vision_code to fix the code to detect only one dog. 
The lower score is 0.23 so I will set the threshold above that + +edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg']) + +false OBSERVATION: [Artifact dog_detector.py] @@ -86,13 +102,20 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} +AGENT: I have edited the code to detect only one dog, I must now run the code and print the results to get the output. + +from dog_detector import detect_dogs +print(detect_dogs('/path/to/images/dog.jpg')) + +false OBSERVATION: ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}] -AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true} +AGENT: One dog is detected, I will show this to the user and ask them if the result looks good. +The code detected one dog, do these results look good to you? +true """ EXAMPLES_CODE2 = """ @@ -103,12 +126,18 @@ Artifact image.jpg loaded to /path/to/images/image.jpg [End of artifacts] -AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifacts('image.jpg')", "let_user_respond": false} +AGENT: The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly. 
+ +view_media_artifacts('image.jpg') + +false OBSERVATION: [Image image.jpg displayed] -AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true} +AGENT: The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance. +The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks? +true """ @@ -119,7 +148,9 @@ [Artifacts loaded] [End of artifacts] -AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true} +AGENT: The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets. +Can you provide an image of workers with helmets? +true USER: Yes you can use workers.png @@ -128,7 +159,11 @@ Artifact workers.png loaded to /path/to/images/workers.png [End of artifacts] -AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])", "let_user_respond": false} +AGENT: I will use the generate_vision_code to count the workers with helmets in the image. 
+ +generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png']) + +false OBSERVATION: [Artifact code.py] @@ -152,22 +187,37 @@ 16| return count [End of artifact] -AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false} +AGENT: I have generated the code to count the workers with helmets in the image, I must now run the code and print the output. + +from code import count_workers_with_helmets +print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png')) + +false OBSERVATION: ----- stdout ----- 2 -AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: Two workers with helmets are detected, I will show this to the user and ask them if the result looks good. +The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'. +true USER: The detections are slightly off. Can you fine tune florence2 using these labels? 
"[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} +AGENT: Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model + +object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]) + +false OBSERVATION: [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id. 
+ +use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf") + +false OBSERVATION: [Artifact code.py edits] @@ -184,11 +234,18 @@ def count_workers_with_helmets(image_path: str, output_path: str): count = 0 [End of edits] -AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')", "let_user_respond": false} +AGENT: I have updated the code to use the fine tuned model, I must now run the code and print the output. + +from code import count_workers_with_helmets +print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png')) + +false OBSERVATION: ----- stdout ----- 3 -AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: Three workers with helmets are detected, I will show this to the user and ask them if the result looks good. +I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'. 
+true """ diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 2a75aa2b..16df1193 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -37,7 +37,6 @@ grounding_dino, grounding_sam, ixc25_image_vqa, - ixc25_temporal_localization, ixc25_video_vqa, load_image, loca_visual_prompt_counting, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 7f59c685..d9537e7c 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,4 +1,3 @@ -import base64 import difflib import json import os @@ -9,7 +8,6 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -import numpy as np from IPython.display import display from redbaron import RedBaron # type: ignore @@ -22,8 +20,7 @@ from vision_agent.tools.tools import TOOL_DESCRIPTIONS from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask from vision_agent.utils.execute import Execution, MimeType -from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes -from vision_agent.utils.video import frames_to_bytes +from vision_agent.utils.image_utils import convert_to_b64 CURRENT_FILE = None CURRENT_LINE = 0 @@ -393,19 +390,6 @@ def generate_vision_plan( redisplay_results(response.test_results) response.test_results = None artifacts[name] = response.model_dump_json() - media_names = extract_json( - AnthropicLMM()( # type: ignore - f"""Extract any media file names from this output in the following JSON format: -{{"media": ["image1.jpg", "image2.jpg"]}} - -{artifacts[name]}""" - ) - ) - if "media" in media_names and isinstance(media_names, dict): - for media in media_names["media"]: - if isinstance(media, str): - with open(media, "rb") as f: - artifacts[media] = f.read() output_str = f"[Start Plan Context, saved at {name}]" for plan in response.plans.keys(): @@ -466,6 +450,12 @@ def detect_dogs(image_path: str): test_multi_plan=test_multi_plan, 
custom_tool_names=custom_tool_names, ) + + # capture and save any files that were saved in the code to the artifacts + extract_and_save_files_to_artifacts( + artifacts, response["code"] + "\n" + response["test"] + ) + redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code @@ -546,6 +536,11 @@ def detect_dogs(image_path: str): test_multi_plan=False, custom_tool_names=custom_tool_names, ) + # capture and save any files that were saved in the code to the artifacts + extract_and_save_files_to_artifacts( + artifacts, response["code"] + "\n" + response["test"] + ) + redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code @@ -567,49 +562,6 @@ def detect_dogs(image_path: str): return view_lines(code_lines, 0, total_lines, name, total_lines) -def write_media_artifact( - artifacts: Artifacts, - name: str, - media: Union[str, np.ndarray, List[np.ndarray]], - fps: Optional[float] = None, -) -> str: - """Writes a media file to the artifacts object. - - Parameters: - artifacts (Artifacts): The artifacts object to save the media to. - name (str): The name of the media artifact to save. - media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either - be a file path, single image or list of frames for a video. - fps (Optional[float]): The frames per second if you are writing a video. 
- """ - if isinstance(media, str): - with open(media, "rb") as f: - media_bytes = f.read() - elif isinstance(media, list): - media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0) - elif isinstance(media, np.ndarray): - media_bytes = numpy_to_bytes(media) - else: - print(f"[Invalid media type {type(media)}]") - return f"[Invalid media type {type(media)}]" - artifacts[name] = media_bytes - print(f"[Media {name} saved]") - display( - { - MimeType.APPLICATION_ARTIFACT: json.dumps( - { - "name": name, - "action": "create", - "content": base64.b64encode(media_bytes).decode("utf-8"), - "contentType": "media_output", - } - ) - }, - raw=True, - ) - return f"[Media {name} saved]" - - def list_artifacts(artifacts: Artifacts) -> str: """Lists all the artifacts that have been loaded into the artifacts object.""" output_str = artifacts.show() @@ -813,6 +765,61 @@ def use_object_detection_fine_tuning( return diff +def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None: + """Extracts and saves files used in the code to the artifacts object. + + Parameters: + artifacts (Artifacts): The artifacts object to save the files to. + code (str): The code to extract the files from. + """ + try: + response = extract_json( + AnthropicLMM()( # type: ignore + f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. 
Below is the code snippet: + +```python +{code} +``` + +Return the file paths in the following JSON format: +{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}""" + ) + ) + except json.JSONDecodeError: + return + + text_file_ext = [ + ".txt", + ".md", + ".rtf", + ".html", + ".htm", + ".xml", + ".json", + ".csv", + ".tsv", + ".yaml", + ".yml", + ".toml", + ".conf", + ".env", ".ini", + ".log", + ".py", + ".java", + ".js", + ".cpp", + ".c", ".sql", + ".sh", + ] + + if "file_paths" in response and isinstance(response["file_paths"], list): + for file_path in response["file_paths"]: + read_mode = "r" if Path(file_path).suffix in text_file_ext else "rb" + if Path(file_path).is_file(): + with open(file_path, read_mode) as f: + artifacts[Path(file_path).name] = f.read() + + META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -822,7 +829,6 @@ def use_object_detection_fine_tuning( generate_vision_plan, generate_vision_code, edit_vision_code, - write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 45f10c33..86e484b2 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -181,6 +181,8 @@ def owl_v2_image( """ image_size = image.shape[:2] + if image_size[0] < 1 or image_size[1] < 1: + return [] if fine_tune_id is not None: image_b64 = convert_to_b64(image) @@ -413,6 +415,9 @@ def florence2_sam2_image( }, ] """ + if image.shape[0] < 1 or image.shape[1] < 1: + return [] + if fine_tune_id is not None: image_b64 = convert_to_b64(image) landing_api = LandingPublicAPI() @@ -701,6 +706,8 @@ def countgd_counting( ] """ image_size = image.shape[:2] + if image_size[0] < 1 or image_size[1] < 1: + return [] buffer_bytes = numpy_to_bytes(image) files = [("image", buffer_bytes)] prompt = prompt.replace(", ", " .") @@ -759,6 +766,8 @@ def countgd_example_based_counting( ] """ image_size = image.shape[:2] + 
if image_size[0] < 1 or image_size[1] < 1: + return [] buffer_bytes = numpy_to_bytes(image) files = [("image", buffer_bytes)] visual_prompts = [ @@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str: >>> ixc25_image_vqa('What is the cat doing?', image) 'drinking milk' """ + if image.shape[0] < 1 or image.shape[1] < 1: + raise ValueError(f"Image is empty, image shape: {image.shape}") buffer_bytes = numpy_to_bytes(image) files = [("image", buffer_bytes)] @@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: return cast(str, data["answer"]) -def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]: - """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video - given a prompt that can be other an object or a phrase. It returns a list of - boolean values indicating whether the object or phrase is present in the - corresponding frame. - - Parameters: - prompt (str): The question about the video - frames (List[np.ndarray]): The reference frames used for the question - - Returns: - List[bool]: A list of boolean values indicating whether the object or phrase is - present in the corresponding frame. 
- - Example - ------- - >>> output = ixc25_temporal_localization('soccer goal', frames) - >>> print(output) - [False, False, False, True, True, True, False, False, False, False] - >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4') - """ - - buffer_bytes = frames_to_bytes(frames) - files = [("video", buffer_bytes)] - payload = { - "prompt": prompt, - "chunk_length": 2, - "function_name": "ixc25_temporal_localization", - } - data: List[int] = send_inference_request( - payload, - "video-temporal-localization?model=internlm-xcomposer", - files=files, - v2=True, - ) - chunk_size = round(len(frames) / len(data)) - data_explode = [[elt] * chunk_size for elt in data] - data_bool = [bool(elt) for sublist in data_explode for elt in sublist] - return data_bool[: len(frames)] - - def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str: """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. 
It returns text @@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]: {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]}, """ + if image.shape[0] < 1 or image.shape[1] < 1: + return {"labels": [], "scores": []} + image_b64 = convert_to_b64(image) data = { "prompt": ",".join(classes), @@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]: >>> vit_image_classification(image) {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]}, """ + if image.shape[0] < 1 or image.shape[1] < 1: + return {"labels": [], "scores": []} image_b64 = convert_to_b64(image) data = { @@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]: >>> vit_nsfw_classification(image) {"label": "normal", "scores": 0.68}, """ + if image.shape[0] < 1 or image.shape[1] < 1: + raise ValueError(f"Image is empty, image shape: {image.shape}") image_b64 = convert_to_b64(image) data = { @@ -1180,6 +1157,8 @@ def florence2_phrase_grounding( ] """ image_size = image.shape[:2] + if image_size[0] < 1 or image_size[1] < 1: + return [] image_b64 = convert_to_b64(image) if fine_tune_id is not None: @@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]: }, ] """ + if image.shape[0] < 1 or image.shape[1] < 1: + return [] image_b64 = convert_to_b64(image) data = { "image": image_b64, @@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray: [10, 11, 15, ..., 202, 202, 205], [10, 10, 10, ..., 200, 200, 200]], dtype=uint8), """ + if image.shape[0] < 1 or image.shape[1] < 1: + raise ValueError(f"Image is empty, image shape: {image.shape}") + image_b64 = convert_to_b64(image) data = { "image": image_b64,