From 77974337a41e7e0d34728b56e898010c7a81a1a5 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 15:28:55 -0700 Subject: [PATCH 01/13] strip installs from code --- tests/unit/test_utils.py | 22 +++++++++++++++++++++- vision_agent/agent/agent_utils.py | 6 ++++++ vision_agent/agent/vision_agent_coder.py | 10 +++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 4db319f9..73471a30 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,4 +1,8 @@ -from vision_agent.agent.agent_utils import extract_code, extract_json +from vision_agent.agent.agent_utils import ( + extract_code, + extract_json, + remove_installs_from_code, +) def test_basic_json_extract(): @@ -43,3 +47,19 @@ def test_basic_json_extract(): a_code = extract_code(a) assert "def test_basic_json_extract():" in a_code assert "assert extract_json(a) == {" in a_code + + +def test_remove_installs_from_code(): + a = """import os +imoprt sys + +!pip install pandas + + +def test(): + print("!pip install dummy") +""" + out = remove_installs_from_code(a) + assert "import os" in out + assert "!pip install pandas" not in out + assert "!pip install dummy" in out diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index dc0debee..624ad608 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -77,3 +77,9 @@ def extract_code(code: str) -> str: if code.startswith("python\n"): code = code[len("python\n") :] return code + + +def remove_installs_from_code(code: str) -> str: + pattern = r"\n!pip install.*?(\n|\Z)\n" + code = re.sub(pattern, "", code, flags=re.DOTALL) + return code diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index aa4d83da..1e5030a2 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -13,7 +13,11 @@ import vision_agent.tools as T from vision_agent.agent import Agent -from vision_agent.agent.agent_utils import extract_code, extract_json +from vision_agent.agent.agent_utils import ( + extract_code, + extract_json, + remove_installs_from_code, +) from vision_agent.agent.vision_agent_coder_prompts import ( CODE, FIX_BUG, @@ -836,8 +840,8 @@ def chat_with_workflow( media=media_list, ) success = cast(bool, results["success"]) - code = cast(str, results["code"]) - test = cast(str, results["test"]) + code = remove_installs_from_code(cast(str, results["code"])) + test = remove_installs_from_code(cast(str, results["test"])) working_memory.extend(results["working_memory"]) # type: ignore plan.append({"code": code, "test": test, "plan": plan_i}) From 25aa67c97d78bdff8f5bbdb63becfcee6570e427 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 15:34:28 -0700 Subject: [PATCH 02/13] make vision agent less verbose --- vision_agent/agent/vision_agent_prompts.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index bc3295ef..a8b1e543 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -28,7 +28,8 @@ 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. 2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. 3. 
**Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`. -4. **Output in JSON**: Respond in the following format in JSON: +4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. +5. **Output in JSON**: Respond in the following format in JSON: ```json {{"thoughts": , "response": , "let_user_respond": }}. @@ -62,7 +63,7 @@ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}] -AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true} +AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true} """ EXAMPLES_CODE1_EXTRA = """ @@ -91,7 +92,7 @@ ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}] -AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} +AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true} """ EXAMPLES_CODE2 = """ @@ -157,16 +158,16 @@ ----- stdout ----- 2 -AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true} USER: The detections are slightly off. Can you fine tune florence2 using these labels? 
"[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} +AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} OBSERVATION: [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} OBSERVATION: [Artifact code.py edits] From 985fcbc8e4fd7f4f4385b0bc585d8273191816c3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 18:49:15 -0700 Subject: [PATCH 03/13] make artifact name more clear --- vision_agent/tools/meta_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 976addae..6489652f 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -116,7 +116,7 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str: ) output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n" + output_str += f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" output_str += "[End of artifacts]\n" print(output_str) return output_str From 96ad6699849b26a8d313da2f2ebd037005b31e26 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 2 Oct 2024 18:49:27 -0700 Subject: [PATCH 04/13] don't load artifacts locally --- vision_agent/agent/vision_agent.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 3c1682e8..c4e36156 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -407,8 +407,6 @@ def chat_with_code( code_interpreter.download_file( str(remote_artifacts_path.name), str(self.local_artifacts_path) ) - artifacts.load(self.local_artifacts_path) - artifacts.save() return orig_chat, artifacts def streaming_message(self, message: Dict[str, Any]) -> None: From 
2499aeaf9a76572839ddc57b89c4fe89554aae75 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 3 Oct 2024 17:25:22 -0700 Subject: [PATCH 05/13] added more error handling for saving files --- tests/unit/tools/test_tools.py | 76 +++++++++++++++++++++++++++------- vision_agent/tools/tools.py | 12 ++++++ 2 files changed, 72 insertions(+), 16 deletions(-) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index b2f1a87a..5fc82b84 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -1,25 +1,69 @@ -# Generated by CodiumAI +import os +import tempfile from pathlib import Path import numpy as np -from vision_agent.tools.tools import save_video +from vision_agent.tools.tools import save_image, save_video -class TestSaveVideo: - def test_saves_frames_without_output_path(self): - frames = [ - np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) - ] - output_path = save_video(frames) - assert Path(output_path).exists() +def test_saves_frames_without_output_path(): + frames = [ + np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) + ] + output_path = save_video(frames) + assert Path(output_path).exists() + os.remove(output_path) + - def test_saves_frames_with_output_path(self, tmp_path): - frames = [ - np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) - ] - video_output_path = str(tmp_path / "output.mp4") - output_path = save_video(frames, video_output_path) +def test_saves_frames_with_output_path(): + frames = [ + np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10) + ] - assert output_path == video_output_path + with tempfile.TemporaryDirectory() as tmp_dir: + video_output_path = Path(tmp_dir) / "output.mp4" + output_path = save_video(frames, str(video_output_path)) + + assert output_path == str(video_output_path) assert Path(output_path).exists() + + +def test_save_null_image(): + image = None + try: + save_image(image, "tmp.jpg") + except ValueError as e: + assert str(e) == "The image is not a valid NumPy array with shape (H, W, C)" + + +def test_save_empty_image(): + image = np.zeros((0, 0, 3), dtype=np.uint8) + try: + save_image(image, "tmp.jpg") + except ValueError as e: + assert str(e) == "The image is not a valid NumPy array with shape (H, W, C)" + + +def test_save_null_video(): + frames = None + try: + save_video(frames, "tmp.mp4") + except ValueError as e: + assert str(e) == "Frames must be a list of NumPy arrays" + +def test_save_empty_list(): + frames = [] + try: + save_video(frames, "tmp.mp4") + except ValueError as e: + assert str(e) == "Frames must be a list of NumPy arrays" + + +def test_save_invalid_frame(): + frames = [np.zeros((0, 0, 3), dtype=np.uint8)] + try: + save_video(frames, "tmp.mp4") + except ValueError as e: + assert str(e) == "Frame is not a valid NumPy array with shape (H, W, C)" + diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 7d881921..486e21a2 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1808,6 +1808,9 @@ def save_image(image: np.ndarray, file_path: str) -> None: """ from IPython.display import display + if not isinstance(image, np.ndarray) or (image.shape[0] == 0 and image.shape[1] == 0): + raise ValueError("The image is not a valid NumPy array with shape (H, W, C)") + pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") display(pil_image) pil_image.save(file_path) @@ -1834,6 +1837,15 @@ def save_video( if fps <= 0: raise ValueError(f"fps 
must be greater than 0 got {fps}") + if not isinstance(frames, list) or len(frames) == 0: + raise ValueError("Frames must be a list of NumPy arrays") + + for frame in frames: + if not isinstance(frame, np.ndarray) or ( + frame.shape[0] == 0 and frame.shape[1] == 0 + ): + raise ValueError("The frame is not a valid NumPy array with shape (H, W, C)") + if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( delete=False, suffix=".mp4" From 9ecf977d33b3b90c3331ca10b52f682edfbc2c9e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 07:49:43 -0700 Subject: [PATCH 06/13] added prompt to keep code closer to user request --- vision_agent/agent/vision_agent_coder_prompts.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 07f2c6e2..7be5a221 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -314,14 +314,15 @@ def check_helmets(image_path): --- END EXAMPLE1 --- **Instructions**: -1. **Understand and Clarify**: Make sure you understand the task. +1. **Understand and Clarify**: Make sure you understand the user request. 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. - 4.1. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. - 4.2. Coordinates are always returned normalized from `vision_agent.tools`. - 4.3. Do not create dummy input or functions, the code must be usable if the user provides new media. - 4.4. Use unnormalized coordinates when comparing bounding boxes. + 4.1. Ensure your code follows the user request first and then the subtasks. + 4.2. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. + 4.3. Coordinates are always returned normalized from `vision_agent.tools`. + 4.4. Do not create dummy input or functions, the code must be usable if the user provides new media. + 4.5. Use unnormalized coordinates when comparing bounding boxes. """ TEST = """ From 85182e4939aff08ceaf0c95985fe8264a6d7232b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 11:11:59 -0700 Subject: [PATCH 07/13] revert back to old prompt --- vision_agent/tools/tools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 486e21a2..698b9ffd 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1146,10 +1146,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s def florence2_phrase_grounding( prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None ) -> List[Dict[str, Any]]: - """'florence2_phrase_grounding' will run florence2 on a image. It can - detect multiple objects given a text prompt which can be object names or caption. - You can optionally separate the object names in the text with commas. It returns - a list of bounding boxes with normalized coordinates, label names and associated + """'florence2_phrase_grounding' is a tool that can detect multiple + objects given a text prompt which can be object names or caption. 
You + can optionally separate the object names in the text with commas. It returns a list + of bounding boxes with normalized coordinates, label names and associated probability scores of 1.0. Parameters: From f01643682b048c4c8e9ece1ce761049212ededdd Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 11:12:05 -0700 Subject: [PATCH 08/13] revert back to old prompt --- vision_agent/agent/vision_agent_coder_prompts.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 7be5a221..07f2c6e2 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -314,15 +314,14 @@ def check_helmets(image_path): --- END EXAMPLE1 --- **Instructions**: -1. **Understand and Clarify**: Make sure you understand the user request. +1. **Understand and Clarify**: Make sure you understand the task. 2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. - 4.1. Ensure your code follows the user request first and then the subtasks. - 4.2. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. - 4.3. Coordinates are always returned normalized from `vision_agent.tools`. - 4.4. Do not create dummy input or functions, the code must be usable if the user provides new media. - 4.5. Use unnormalized coordinates when comparing bounding boxes. + 4.1. Take in the media path as an argument and load with either `load_image` or `extract_frames_and_timestamps`. + 4.2. Coordinates are always returned normalized from `vision_agent.tools`. + 4.3. Do not create dummy input or functions, the code must be usable if the user provides new media. + 4.4. Use unnormalized coordinates when comparing bounding boxes. 
""" TEST = """ From 5e33fcc5c89c0a8b8a7b21732866c75edc35f4f1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:31:55 -0700 Subject: [PATCH 09/13] formatting fix --- tests/unit/tools/test_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index 5fc82b84..292d2eae 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -52,6 +52,7 @@ def test_save_null_video(): except ValueError as e: assert str(e) == "Frames must be a list of NumPy arrays" + def test_save_empty_list(): frames = [] try: @@ -66,4 +67,3 @@ def test_save_invalid_frame(): save_video(frames, "tmp.mp4") except ValueError as e: assert str(e) == "Frame is not a valid NumPy array with shape (H, W, C)" - From dbdb9b3f623331c67e4c205c808d7a11f147fe2f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:33:44 -0700 Subject: [PATCH 10/13] formatting fix --- vision_agent/tools/meta_tools.py | 4 +++- vision_agent/tools/tool_utils.py | 2 +- vision_agent/tools/tools.py | 8 ++++++-- vision_agent/tools/tools_types.py | 2 +- 4 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 6489652f..dc910300 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -116,7 +116,9 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str: ) output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - output_str += f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" + output_str += ( + f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" + ) output_str += "[End of artifacts]\n" print(output_str) return output_str diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 924b96e6..b35d6fef 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -1,6 +1,6 @@ -import os import inspect import logging +import os from base64 import b64encode from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 698b9ffd..bc73a9ae 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1808,7 +1808,9 @@ def save_image(image: np.ndarray, file_path: str) -> None: """ from IPython.display import display - if not isinstance(image, np.ndarray) or (image.shape[0] == 0 and image.shape[1] == 0): + if not isinstance(image, np.ndarray) or ( + image.shape[0] == 0 and image.shape[1] == 0 + ): raise ValueError("The image is not a valid NumPy array with shape (H, W, C)") pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") @@ -1844,7 +1846,9 @@ def save_video( if not isinstance(frame, np.ndarray) or ( frame.shape[0] == 0 and frame.shape[1] == 0 ): - raise ValueError("The frame is not a valid NumPy array with shape (H, W, C)") + raise ValueError( + "The frame is not a valid NumPy array with shape (H, W, C)" + ) if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 1cc765b6..b2812fc0 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -1,6 +1,6 @@ from enum import Enum -from uuid import UUID from typing import List, Optional, Tuple, Union +from uuid import UUID from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, 
field_serializer From 71a7bd653d94b563ce2b3366f8bf4cdccdfbf3b3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:39:21 -0700 Subject: [PATCH 11/13] fix test case --- tests/unit/tools/test_tools.py | 2 +- vision_agent/tools/tools.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py index 292d2eae..eec3c78f 100644 --- a/tests/unit/tools/test_tools.py +++ b/tests/unit/tools/test_tools.py @@ -66,4 +66,4 @@ def test_save_invalid_frame(): try: save_video(frames, "tmp.mp4") except ValueError as e: - assert str(e) == "Frame is not a valid NumPy array with shape (H, W, C)" + assert str(e) == "A frame is not a valid NumPy array with shape (H, W, C)" diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index bc73a9ae..63776be2 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1847,7 +1847,7 @@ def save_video( frame.shape[0] == 0 and frame.shape[1] == 0 ): raise ValueError( - "The frame is not a valid NumPy array with shape (H, W, C)" + "A frame is not a valid NumPy array with shape (H, W, C)" ) if output_video_path is None: From 0a087ce9e43519e55dc425a917afa226731b09d9 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 13:40:53 -0700 Subject: [PATCH 12/13] fix format issue --- vision_agent/tools/tools.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 63776be2..f83132a5 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1846,9 +1846,7 @@ def save_video( if not isinstance(frame, np.ndarray) or ( frame.shape[0] == 0 and frame.shape[1] == 0 ): - raise ValueError( - "A frame is not a valid NumPy array with shape (H, W, C)" - ) + raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)") if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( From 9a1394ef2527cdaadd3a4bd6fc24adade7e673c0 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 4 Oct 2024 14:30:35 -0700 Subject: [PATCH 13/13] merge overlay count into overlay bbox --- vision_agent/tools/__init__.py | 1 - vision_agent/tools/tools.py | 95 ++++++++++++++-------------------- 2 files changed, 40 insertions(+), 56 deletions(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index e5b7c334..da74f677 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -45,7 +45,6 @@ loca_zero_shot_counting, ocr, overlay_bounding_boxes, - overlay_counting_results, overlay_heat_map, overlay_segmentation_masks, owl_v2_image, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index f83132a5..b2b8a985 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -13,7 +13,7 @@ import cv2 import numpy as np import requests -from PIL import Image, ImageDraw, ImageEnhance, ImageFont +from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore from pytube import YouTube # type: ignore @@ -1917,30 +1917,36 @@ def overlay_bounding_boxes( bboxes = bbox_int[i] bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True) - width, height = pil_image.size - fontsize = max(12, int(min(width, height) / 40)) - draw = ImageDraw.Draw(pil_image) - font = ImageFont.truetype( - str( - resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf") - ), - fontsize, - ) - - for elt in bboxes: - label = elt["label"] - box = 
elt["bbox"] - scores = elt["score"] - - # denormalize the box if it is normalized - box = denormalize_bbox(box, (height, width)) - draw.rectangle(box, outline=color[label], width=4) - text = f"{label}: {scores:.2f}" - text_box = draw.textbbox((box[0], box[1]), text=text, font=font) - draw.rectangle( - (box[0], box[1], text_box[2], text_box[3]), fill=color[label] + if len(bboxes) > 20: + pil_image = _plot_counting(pil_image, bboxes, color) + else: + width, height = pil_image.size + fontsize = max(12, int(min(width, height) / 40)) + draw = ImageDraw.Draw(pil_image) + font = ImageFont.truetype( + str( + resources.files("vision_agent.fonts").joinpath( + "default_font_ch_en.ttf" + ) + ), + fontsize, ) - draw.text((box[0], box[1]), text, fill="black", font=font) + + for elt in bboxes: + label = elt["label"] + box = elt["bbox"] + scores = elt["score"] + + # denormalize the box if it is normalized + box = denormalize_bbox(box, (height, width)) + draw.rectangle(box, outline=color[label], width=4) + text = f"{label}: {scores:.2f}" + text_box = draw.textbbox((box[0], box[1]), text=text, font=font) + draw.rectangle( + (box[0], box[1], text_box[2], text_box[3]), fill=color[label] + ) + draw.text((box[0], box[1]), text, fill="black", font=font) + frame_out.append(np.array(pil_image)) return frame_out[0] if len(frame_out) == 1 else frame_out @@ -2099,39 +2105,19 @@ def overlay_heat_map( return np.array(combined) -def overlay_counting_results( - image: np.ndarray, instances: List[Dict[str, Any]] -) -> np.ndarray: - """'overlay_counting_results' is a utility function that displays counting results on - an image. - - Parameters: - image (np.ndarray): The image to display the bounding boxes on. - instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding - box information of each instance - - Returns: - np.ndarray: The image with the instance_id dislpayed - - Example - ------- - >>> image_with_bboxes = overlay_counting_results( - image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}], - ) - """ - pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") - color = (158, 218, 229) - - width, height = pil_image.size +def _plot_counting( + image: Image.Image, + bboxes: List[Dict[str, Any]], + colors: Dict[str, Tuple[int, int, int]], +) -> Image.Image: + width, height = image.size fontsize = max(10, int(min(width, height) / 80)) - pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5) - draw = ImageDraw.Draw(pil_image) + draw = ImageDraw.Draw(image) font = ImageFont.truetype( str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")), fontsize, ) - - for i, elt in enumerate(instances, 1): + for i, elt in enumerate(bboxes, 1): label = f"{i}" box = elt["bbox"] @@ -2153,7 +2139,7 @@ def overlay_counting_results( text_y1 = cy + text_height / 2 # Draw the rectangle encapsulating the text - draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color) + draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]]) # Draw the text at the center of the bounding box draw.text( @@ -2164,7 +2150,7 @@ def overlay_counting_results( anchor="lt", ) - return np.array(pil_image) + return image FUNCTION_TOOLS = [ @@ -2197,7 +2183,6 @@ def overlay_counting_results( overlay_bounding_boxes, overlay_segmentation_masks, overlay_heat_map, - overlay_counting_results, ] TOOLS = FUNCTION_TOOLS + UTIL_TOOLS