From 77974337a41e7e0d34728b56e898010c7a81a1a5 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 15:28:55 -0700 Subject: [PATCH 01/13] strip installs from code --- tests/unit/test_utils.py | 22 +++++++++++++++++++++- vision_agent/agent/agent_utils.py | 6 ++++++ vision_agent/agent/vision_agent_coder.py | 10 +++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 4db319f9..73471a30 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,4 +1,8 @@ -from vision_agent.agent.agent_utils import extract_code, extract_json +from vision_agent.agent.agent_utils import ( + extract_code, + extract_json, + remove_installs_from_code, +) def test_basic_json_extract(): @@ -43,3 +47,19 @@ def test_basic_json_extract(): a_code = extract_code(a) assert "def test_basic_json_extract():" in a_code assert "assert extract_json(a) == {" in a_code + + +def test_remove_installs_from_code(): + a = """import os +imoprt sys + +!pip install pandas + + +def test(): + print("!pip install dummy") +""" + out = remove_installs_from_code(a) + assert "import os" in out + assert "!pip install pandas" not in out + assert "!pip install dummy" in out diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index dc0debee..624ad608 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -77,3 +77,9 @@ def extract_code(code: str) -> str: if code.startswith("python\n"): code = code[len("python\n") :] return code + + +def remove_installs_from_code(code: str) -> str: + pattern = r"\n!pip install.*?(\n|\Z)\n" + code = re.sub(pattern, "", code, flags=re.DOTALL) + return code diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index aa4d83da..1e5030a2 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -13,7 +13,11 @@ import vision_agent.tools as T from vision_agent.agent import Agent -from vision_agent.agent.agent_utils import extract_code, extract_json +from vision_agent.agent.agent_utils import ( + extract_code, + extract_json, + remove_installs_from_code, +) from vision_agent.agent.vision_agent_coder_prompts import ( CODE, FIX_BUG, @@ -836,8 +840,8 @@ def chat_with_workflow( media=media_list, ) success = cast(bool, results["success"]) - code = cast(str, results["code"]) - test = cast(str, results["test"]) + code = remove_installs_from_code(cast(str, results["code"])) + test = remove_installs_from_code(cast(str, results["test"])) working_memory.extend(results["working_memory"]) # type: ignore plan.append({"code": code, "test": test, "plan": plan_i}) From 25aa67c97d78bdff8f5bbdb63becfcee6570e427 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 15:34:28 -0700 Subject: [PATCH 02/13] make vision agent less verbose --- vision_agent/agent/vision_agent_prompts.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index bc3295ef..a8b1e543 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -28,7 +28,8 @@ 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. 2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. 3. 
**Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`. -4. **Output in JSON**: Respond in the following format in JSON: +4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. +5. **Output in JSON**: Respond in the following format in JSON: ```json {{"thoughts": , "response": , "let_user_respond": }}. @@ -62,7 +63,7 @@ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}] -AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true} +AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true} """ EXAMPLES_CODE1_EXTRA = """ @@ -91,7 +92,7 @@ ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}] -AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} +AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true} """ EXAMPLES_CODE2 = """ @@ -157,16 +158,16 @@ ----- stdout ----- 2 -AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true} USER: The detections are slightly off. Can you fine tune florence2 using these labels? 
"[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} +AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} OBSERVATION: [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} OBSERVATION: [Artifact code.py edits] From 985fcbc8e4fd7f4f4385b0bc585d8273191816c3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 18:49:15 -0700 Subject: [PATCH 03/13] make artifact name more clear --- vision_agent/tools/meta_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 976addae..6489652f 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -116,7 +116,7 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str: ) output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n" + output_str += f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" output_str += "[End of artifacts]\n" print(output_str) return output_str From 96ad6699849b26a8d313da2f2ebd037005b31e26 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 18:49:27 -0700 Subject: [PATCH 04/13] don't load artifacts locally --- vision_agent/agent/vision_agent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 3c1682e8..c4e36156 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -407,8 +407,6 @@ def chat_with_code( code_interpreter.download_file( str(remote_artifacts_path.name), str(self.local_artifacts_path) ) - artifacts.load(self.local_artifacts_path) - artifacts.save() return orig_chat, artifacts def streaming_message(self, message: Dict[str, Any]) -> None: From 
2499aeaf9a76572839ddc57b89c4fe89554aae75 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 3 Oct 2024 17:25:22 -0700 Subject: [PATCH 05/13] added more error handling for saving files --- tests/unit/tools/test_tools.py | 76 +++++++++++++++++++++++++++------- vision_agent/tools/tools.py | 12 ++++++ 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index b2f1a87a..5fc82b84 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -1,25 +1,69 @@ -# Generated by CodiumAI +import os +import tempfile from pathlib import Path import numpy as np -from vision_agent.tools.tools import save_video +from vision_agent.tools.tools import save_image, save_video -class TestSaveVideo: - def test_saves_frames_without_output_path(self): - frames = [ - np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) - ] - output_path = save_video(frames) - assert Path(output_path).exists() +def test_saves_frames_without_output_path(): + frames = [ + np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) + ] + output_path = save_video(frames) + assert Path(output_path).exists() + os.remove(output_path) + - def test_saves_frames_with_output_path(self, tmp_path): - frames = [ - np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) - ] - video_output_path = str(tmp_path / "output.mp4") - output_path = save_video(frames, video_output_path) +def test_saves_frames_with_output_path(): + frames = [ + np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) + ] - assert output_path == video_output_path + with tempfile.TemporaryDirectory() as tmp_dir: + video_output_path = Path(tmp_dir) / "output.mp4" + output_path = save_video(frames, str(video_output_path)) + + assert output_path == str(video_output_path) assert Path(output_path).exists() + + +def test_save_null_image(): + image = None + try: + save_image(image, "tmp.jpg") + except ValueError as e: + assert str(e) == "The image is not a valid NumPy array with shape (H, W, C)" + + +def test_save_empty_image(): + image = np.zeros((0, 0, 3), dtype=np.uint8) + try: + save_image(image, "tmp.jpg") + except ValueError as e: + assert str(e) == "The image is not a valid NumPy array with shape (H, W, C)" + + +def test_save_null_video(): + frames = None + try: + save_video(frames, "tmp.mp4") + except ValueError as e: + assert str(e) == "Frames must be a list of NumPy arrays" + +def test_save_empty_list(): + frames = [] + try: + save_video(frames, "tmp.mp4") + except ValueError as e: + assert str(e) == "Frames must be a list of NumPy arrays" + + +def test_save_invalid_frame(): + frames = [np.zeros((0, 0, 3), dtype=np.uint8)] + try: + save_video(frames, "tmp.mp4") + except ValueError as e: + assert str(e) == "Frame is not a valid NumPy array with shape (H, W, C)" + diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 7d881921..486e21a2 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1808,6 +1808,9 @@ def save_image(image: np.ndarray, file_path: str) -> None: """ from IPython.display import display + if not isinstance(image, np.ndarray) or (image.shape[0] == 0 and image.shape[1] == 0): + raise ValueError("The image is not a valid NumPy array with shape (H, W, C)") + pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") display(pil_image) pil_image.save(file_path) @@ -1834,6 +1837,15 @@ def save_video( if fps <= 0: raise ValueError(f"fps 
must be greater than 0 got {fps}") + if not isinstance(frames, list) or len(frames) == 0: + raise ValueError("Frames must be a list of NumPy arrays") + + for frame in frames: + if not isinstance(frame, np.ndarray) or ( + frame.shape[0] == 0 and frame.shape[1] == 0 + ): + raise ValueError("The frame is not a valid NumPy array with shape (H, W, C)") + if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( delete=False, suffix=".mp4" From 9ecf977d33b3b90c3331ca10b52f682edfbc2c9e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 07:49:43 -0700 Subject: [PATCH 06/13] added prompt to keep code closer to user request --- vision_agent/agent/vision_agent_coder_prompts.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 07f2c6e2..7be5a221 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -314,14 +314,15 @@ def check_helmets(image_path): --- END EXAMPLE1 --- **Instructions**: -1. **Understand and Clarify**: Make sure you understand the task. +1. **Understand and Clarify**: Make sure you understand the user request. 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. - 4.1. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. - 4.2. Coordinates are always returned normalized from `vision_agent.tools`. - 4.3. Do not create dummy input or functions, the code must be usable if the user provides new media. - 4.4. Use unnormalized coordinates when comparing bounding boxes. + 4.1. Ensure your code follows the user request first and then the subtasks. + 4.2. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. + 4.3. Coordinates are always returned normalized from `vision_agent.tools`. + 4.4. Do not create dummy input or functions, the code must be usable if the user provides new media. + 4.5. Use unnormalized coordinates when comparing bounding boxes. """ TEST = """ From 85182e4939aff08ceaf0c95985fe8264a6d7232b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 11:11:59 -0700 Subject: [PATCH 07/13] revert back to old prompt --- vision_agent/tools/tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 486e21a2..698b9ffd 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1146,10 +1146,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s def florence2_phrase_grounding( prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None ) -> List[Dict[str, Any]]: - """'florence2_phrase_grounding' will run florence2 on a image. It can - detect multiple objects given a text prompt which can be object names or caption. - You can optionally separate the object names in the text with commas. It returns - a list of bounding boxes with normalized coordinates, label names and associated + """'florence2_phrase_grounding' is a tool that can detect multiple + objects given a text prompt which can be object names or caption. 
You + can optionally separate the object names in the text with commas. It returns a list + of bounding boxes with normalized coordinates, label names and associated probability scores of 1.0. Parameters: From f01643682b048c4c8e9ece1ce761049212ededdd Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 11:12:05 -0700 Subject: [PATCH 08/13] revert back to old prompt --- vision_agent/agent/vision_agent_coder_prompts.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 7be5a221..07f2c6e2 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -314,15 +314,14 @@ def check_helmets(image_path): --- END EXAMPLE1 --- **Instructions**: -1. **Understand and Clarify**: Make sure you understand the user request. +1. **Understand and Clarify**: Make sure you understand the task. 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. - 4.1. Ensure your code follows the user request first and then the subtasks. - 4.2. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. - 4.3. Coordinates are always returned normalized from `vision_agent.tools`. - 4.4. Do not create dummy input or functions, the code must be usable if the user provides new media. - 4.5. Use unnormalized coordinates when comparing bounding boxes. + 4.1. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. + 4.2. Coordinates are always returned normalized from `vision_agent.tools`. + 4.3. Do not create dummy input or functions, the code must be usable if the user provides new media. + 4.4. Use unnormalized coordinates when comparing bounding boxes. 
""" TEST = """ From 5e33fcc5c89c0a8b8a7b21732866c75edc35f4f1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:31:55 -0700 Subject: [PATCH 09/13] formatting fix --- tests/unit/tools/test_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index 5fc82b84..292d2eae 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -52,6 +52,7 @@ def test_save_null_video(): except ValueError as e: assert str(e) == "Frames must be a list of NumPy arrays" + def test_save_empty_list(): frames = [] try: @@ -66,4 +67,3 @@ def test_save_invalid_frame(): save_video(frames, "tmp.mp4") except ValueError as e: assert str(e) == "Frame is not a valid NumPy array with shape (H, W, C)" - From dbdb9b3f623331c67e4c205c808d7a11f147fe2f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:33:44 -0700 Subject: [PATCH 10/13] formatting fix --- vision_agent/tools/meta_tools.py | 4 +++- vision_agent/tools/tool_utils.py | 2 +- vision_agent/tools/tools.py | 8 ++++++-- vision_agent/tools/tools_types.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 6489652f..dc910300 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -116,7 +116,9 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str: ) output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - output_str += f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" + output_str += ( + f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" + ) output_str += "[End of artifacts]\n" print(output_str) return output_str diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 924b96e6..b35d6fef 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -1,6 +1,6 @@ -import os import inspect import logging +import os from base64 import b64encode from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 698b9ffd..bc73a9ae 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1808,7 +1808,9 @@ def save_image(image: np.ndarray, file_path: str) -> None: """ from IPython.display import display - if not isinstance(image, np.ndarray) or (image.shape[0] == 0 and image.shape[1] == 0): + if not isinstance(image, np.ndarray) or ( + image.shape[0] == 0 and image.shape[1] == 0 + ): raise ValueError("The image is not a valid NumPy array with shape (H, W, C)") pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") @@ -1844,7 +1846,9 @@ def save_video( if not isinstance(frame, np.ndarray) or ( frame.shape[0] == 0 and frame.shape[1] == 0 ): - raise ValueError("The frame is not a valid NumPy array with shape (H, W, C)") + raise ValueError( + "The frame is not a valid NumPy array with shape (H, W, C)" + ) if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 1cc765b6..b2812fc0 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -1,6 +1,6 @@ from enum import Enum -from uuid import UUID from typing import List, Optional, Tuple, Union +from uuid import UUID from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, 
field_serializer From 71a7bd653d94b563ce2b3366f8bf4cdccdfbf3b3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:39:21 -0700 Subject: [PATCH 11/13] fix test case --- tests/unit/tools/test_tools.py | 2 +- vision_agent/tools/tools.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index 292d2eae..eec3c78f 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -66,4 +66,4 @@ def test_save_invalid_frame(): try: save_video(frames, "tmp.mp4") except ValueError as e: - assert str(e) == "Frame is not a valid NumPy array with shape (H, W, C)" + assert str(e) == "A frame is not a valid NumPy array with shape (H, W, C)" diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index bc73a9ae..63776be2 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1847,7 +1847,7 @@ def save_video( frame.shape[0] == 0 and frame.shape[1] == 0 ): raise ValueError( - "The frame is not a valid NumPy array with shape (H, W, C)" + "A frame is not a valid NumPy array with shape (H, W, C)" ) if output_video_path is None: From 0a087ce9e43519e55dc425a917afa226731b09d9 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:40:53 -0700 Subject: [PATCH 12/13] fix format issue --- vision_agent/tools/tools.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 63776be2..f83132a5 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1846,9 +1846,7 @@ def save_video( if not isinstance(frame, np.ndarray) or ( frame.shape[0] == 0 and frame.shape[1] == 0 ): - raise ValueError( - "A frame is not a valid NumPy array with shape (H, W, C)" - ) + raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)") if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( From 9a1394ef2527cdaadd3a4bd6fc24adade7e673c0 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 14:30:35 -0700 Subject: [PATCH 13/13] merge overlay count into overlay bbox --- vision_agent/tools/__init__.py | 1 - vision_agent/tools/tools.py | 95 ++++++++++++++-------------------- 2 files changed, 40 insertions(+), 56 deletions(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index e5b7c334..da74f677 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -45,7 +45,6 @@ loca_zero_shot_counting, ocr, overlay_bounding_boxes, - overlay_counting_results, overlay_heat_map, overlay_segmentation_masks, owl_v2_image, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index f83132a5..b2b8a985 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -13,7 +13,7 @@ import cv2 import numpy as np import requests -from PIL import Image, ImageDraw, ImageEnhance, ImageFont +from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore from pytube import YouTube # type: ignore @@ -1917,30 +1917,36 @@ def overlay_bounding_boxes( bboxes = bbox_int[i] bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True) - width, height = pil_image.size - fontsize = max(12, int(min(width, height) / 40)) - draw = ImageDraw.Draw(pil_image) - font = ImageFont.truetype( - str( - resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf") - ), - fontsize, - ) - - for elt in bboxes: - label = elt["label"] - box = 
elt["bbox"] - scores = elt["score"] - - # denormalize the box if it is normalized - box = denormalize_bbox(box, (height, width)) - draw.rectangle(box, outline=color[label], width=4) - text = f"{label}: {scores:.2f}" - text_box = draw.textbbox((box[0], box[1]), text=text, font=font) - draw.rectangle( - (box[0], box[1], text_box[2], text_box[3]), fill=color[label] + if len(bboxes) > 20: + pil_image = _plot_counting(pil_image, bboxes, color) + else: + width, height = pil_image.size + fontsize = max(12, int(min(width, height) / 40)) + draw = ImageDraw.Draw(pil_image) + font = ImageFont.truetype( + str( + resources.files("vision_agent.fonts").joinpath( + "default_font_ch_en.ttf" + ) + ), + fontsize, ) - draw.text((box[0], box[1]), text, fill="black", font=font) + + for elt in bboxes: + label = elt["label"] + box = elt["bbox"] + scores = elt["score"] + + # denormalize the box if it is normalized + box = denormalize_bbox(box, (height, width)) + draw.rectangle(box, outline=color[label], width=4) + text = f"{label}: {scores:.2f}" + text_box = draw.textbbox((box[0], box[1]), text=text, font=font) + draw.rectangle( + (box[0], box[1], text_box[2], text_box[3]), fill=color[label] + ) + draw.text((box[0], box[1]), text, fill="black", font=font) + frame_out.append(np.array(pil_image)) return frame_out[0] if len(frame_out) == 1 else frame_out @@ -2099,39 +2105,19 @@ def overlay_heat_map( return np.array(combined) -def overlay_counting_results( - image: np.ndarray, instances: List[Dict[str, Any]] -) -> np.ndarray: - """'overlay_counting_results' is a utility function that displays counting results on - an image. - - Parameters: - image (np.ndarray): The image to display the bounding boxes on. - instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding - box information of each instance - - Returns: - np.ndarray: The image with the instance_id dislpayed - - Example - ------- - >>> image_with_bboxes = overlay_counting_results( - image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}], - ) - """ - pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") - color = (158, 218, 229) - - width, height = pil_image.size +def _plot_counting( + image: Image.Image, + bboxes: List[Dict[str, Any]], + colors: Dict[str, Tuple[int, int, int]], +) -> Image.Image: + width, height = image.size fontsize = max(10, int(min(width, height) / 80)) - pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5) - draw = ImageDraw.Draw(pil_image) + draw = ImageDraw.Draw(image) font = ImageFont.truetype( str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")), fontsize, ) - - for i, elt in enumerate(instances, 1): + for i, elt in enumerate(bboxes, 1): label = f"{i}" box = elt["bbox"] @@ -2153,7 +2139,7 @@ def overlay_counting_results( text_y1 = cy + text_height / 2 # Draw the rectangle encapsulating the text - draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color) + draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]]) # Draw the text at the center of the bounding box draw.text( @@ -2164,7 +2150,7 @@ def overlay_counting_results( anchor="lt", ) - return np.array(pil_image) + return image FUNCTION_TOOLS = [ @@ -2197,7 +2183,6 @@ def overlay_counting_results( overlay_bounding_boxes, overlay_segmentation_masks, overlay_heat_map, - overlay_counting_results, ] TOOLS = FUNCTION_TOOLS + UTIL_TOOLS