diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 4db319f9..73471a30 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,4 +1,8 @@
-from vision_agent.agent.agent_utils import extract_code, extract_json
+from vision_agent.agent.agent_utils import (
+    extract_code,
+    extract_json,
+    remove_installs_from_code,
+)
 
 
 def test_basic_json_extract():
@@ -43,3 +47,25 @@ def test_basic_json_extract():
     a_code = extract_code(a)
     assert "def test_basic_json_extract():" in a_code
     assert "assert extract_json(a) == {" in a_code
+
+
+def test_remove_installs_from_code():
+    a = """import os
+imoprt sys
+
+!pip install pandas
+
+
+def test():
+    print("!pip install dummy")
+"""
+    out = remove_installs_from_code(a)
+    assert "import os" in out
+    assert "!pip install pandas" not in out
+    assert "!pip install dummy" in out
+
+
+def test_remove_installs_keeps_following_code():
+    a = "!pip install pandas\nimport pandas as pd\nx = 1\n\ny = 2\n"
+    out = remove_installs_from_code(a)
+    assert out == "import pandas as pd\nx = 1\n\ny = 2\n"
diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py
index b2f1a87a..eec3c78f 100644
--- a/tests/unit/tools/test_tools.py
+++ b/tests/unit/tools/test_tools.py
@@ -1,25 +1,72 @@
-# Generated by CodiumAI
+import os
+import tempfile
 from pathlib import Path
 
 import numpy as np
+import pytest
 
-from vision_agent.tools.tools import save_video
+from vision_agent.tools.tools import save_image, save_video
 
 
-class TestSaveVideo:
-    def test_saves_frames_without_output_path(self):
-        frames = [
-            np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
-        ]
-        output_path = save_video(frames)
-        assert Path(output_path).exists()
+def test_saves_frames_without_output_path():
+    frames = [
+        np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
+    ]
+    output_path = save_video(frames)
+    assert Path(output_path).exists()
+    os.remove(output_path)
+
 
-    def test_saves_frames_with_output_path(self, tmp_path):
-        frames = [
-            np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
-        ]
-        video_output_path = str(tmp_path / "output.mp4")
-        output_path = save_video(frames, video_output_path)
+def test_saves_frames_with_output_path():
+    frames = [
+        np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8) for _ in range(10)
+    ]
 
-        assert output_path == video_output_path
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        video_output_path = Path(tmp_dir) / "output.mp4"
+        output_path = save_video(frames, str(video_output_path))
+
+        assert output_path == str(video_output_path)
         assert Path(output_path).exists()
+
+
+def test_save_null_image():
+    image = None
+    # pytest.raises fails the test when nothing is raised; a bare try/except
+    # would let a missing validation pass silently.
+    with pytest.raises(ValueError) as exc_info:
+        save_image(image, "tmp.jpg")
+    expected = "The image is not a valid NumPy array with shape (H, W, C)"
+    assert str(exc_info.value) == expected
+
+
+def test_save_empty_image():
+    image = np.zeros((0, 0, 3), dtype=np.uint8)
+    with pytest.raises(ValueError) as exc_info:
+        save_image(image, "tmp.jpg")
+    expected = "The image is not a valid NumPy array with shape (H, W, C)"
+    assert str(exc_info.value) == expected
+
+
+def test_save_null_video():
+    frames = None
+    with pytest.raises(ValueError) as exc_info:
+        save_video(frames, "tmp.mp4")
+    expected = "Frames must be a list of NumPy arrays"
+    assert str(exc_info.value) == expected
+
+
+def test_save_empty_list():
+    frames = []
+    with pytest.raises(ValueError) as exc_info:
+        save_video(frames, "tmp.mp4")
+    expected = "Frames must be a list of NumPy arrays"
+    assert str(exc_info.value) == expected
+
+
+def test_save_invalid_frame():
+    frames = [np.zeros((0, 0, 3), dtype=np.uint8)]
+    with pytest.raises(ValueError) as exc_info:
+        save_video(frames, "tmp.mp4")
+    expected = "A frame is not a valid NumPy array with shape (H, W, C)"
+    assert str(exc_info.value) == expected
diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py
index dc0debee..624ad608 100644
--- a/vision_agent/agent/agent_utils.py
+++ b/vision_agent/agent/agent_utils.py
@@ -77,3 +77,20 @@ def extract_code(code: str) -> str:
     if code.startswith("python\n"):
         code = code[len("python\n") :]
     return code
+
+
+def remove_installs_from_code(code: str) -> str:
+    """Remove notebook-style `!pip install ...` lines from generated code.
+
+    Only lines that start with `!pip install` are dropped, each up to and including
+    its own newline; occurrences inside other lines (e.g. string literals) are kept.
+
+    Parameters:
+        code (str): The source code to clean.
+
+    Returns:
+        str: The code with the install lines removed.
+    """
+    # Match a single line only: with DOTALL a lazy `.*?` ran on to the next blank
+    # line and deleted any code that directly followed the install command.
+    return re.sub(r"^!pip install[^\n]*\n?", "", code, flags=re.MULTILINE)
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index
3c1682e8..c4e36156 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -407,8 +407,6 @@ def chat_with_code( code_interpreter.download_file( str(remote_artifacts_path.name), str(self.local_artifacts_path) ) - artifacts.load(self.local_artifacts_path) - artifacts.save() return orig_chat, artifacts def streaming_message(self, message: Dict[str, Any]) -> None: diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index aa4d83da..1e5030a2 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -13,7 +13,11 @@ import vision_agent.tools as T from vision_agent.agent import Agent -from vision_agent.agent.agent_utils import extract_code, extract_json +from vision_agent.agent.agent_utils import ( + extract_code, + extract_json, + remove_installs_from_code, +) from vision_agent.agent.vision_agent_coder_prompts import ( CODE, FIX_BUG, @@ -836,8 +840,8 @@ def chat_with_workflow( media=media_list, ) success = cast(bool, results["success"]) - code = cast(str, results["code"]) - test = cast(str, results["test"]) + code = remove_installs_from_code(cast(str, results["code"])) + test = remove_installs_from_code(cast(str, results["test"])) working_memory.extend(results["working_memory"]) # type: ignore plan.append({"code": code, "test": test, "plan": plan_i}) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index bc3295ef..a8b1e543 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -28,7 +28,8 @@ 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. 2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`. 3. **Execute**: Do only what the user asked you to do and no more. 
If you need to ask the user a question, set `let_user_respond` to `true`. -4. **Output in JSON**: Respond in the following format in JSON: +4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation. +5. **Output in JSON**: Respond in the following format in JSON: ```json {{"thoughts": , "response": , "let_user_respond": }}. @@ -62,7 +63,7 @@ [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}] -AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true} +AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected two dogs, do the results look good to you?", "let_user_respond": true} """ EXAMPLES_CODE1_EXTRA = """ @@ -91,7 +92,7 @@ ----- stdout ----- [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}] -AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} +AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true} """ EXAMPLES_CODE2 = """ @@ -157,16 +158,16 @@ ----- stdout ----- 2 -AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} +AGENT: 
{"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true} USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} +AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} OBSERVATION: [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. 
use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} OBSERVATION: [Artifact code.py edits] diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index e5b7c334..da74f677 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -45,7 +45,6 @@ loca_zero_shot_counting, ocr, overlay_bounding_boxes, - overlay_counting_results, overlay_heat_map, overlay_segmentation_masks, owl_v2_image, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 976addae..dc910300 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -116,7 +116,9 @@ def show(self, uploaded_file_path: Optional[Union[str, Path]] = None) -> str: ) output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - output_str += f"Artifact {k} loaded to {str(loaded_path / k)}\n" + output_str += ( + f"Artifact name: {k}, loaded to path: {str(loaded_path / k)}\n" + ) output_str += "[End of artifacts]\n" print(output_str) return output_str diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 924b96e6..b35d6fef 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -1,6 +1,6 @@ -import os import inspect import logging +import os from base64 import b64encode from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index b33df8ec..71646c45 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -13,7 +13,7 @@ import cv2 import numpy as np import requests -from PIL import 
Image, ImageDraw, ImageEnhance, ImageFont +from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore from pytube import YouTube # type: ignore @@ -1150,10 +1150,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s def florence2_phrase_grounding( prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None ) -> List[Dict[str, Any]]: - """'florence2_phrase_grounding' will run florence2 on a image. It can - detect multiple objects given a text prompt which can be object names or caption. - You can optionally separate the object names in the text with commas. It returns - a list of bounding boxes with normalized coordinates, label names and associated + """'florence2_phrase_grounding' is a tool that can detect multiple + objects given a text prompt which can be object names or caption. You + can optionally separate the object names in the text with commas. It returns a list + of bounding boxes with normalized coordinates, label names and associated probability scores of 1.0. 
Parameters: @@ -1812,6 +1812,11 @@ def save_image(image: np.ndarray, file_path: str) -> None: """ from IPython.display import display + if not isinstance(image, np.ndarray) or ( + image.ndim < 2 or image.shape[0] == 0 or image.shape[1] == 0 + ): + raise ValueError("The image is not a valid NumPy array with shape (H, W, C)") + pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") display(pil_image) pil_image.save(file_path) @@ -1838,6 +1843,15 @@ def save_video( if fps <= 0: raise ValueError(f"fps must be greater than 0 got {fps}") + if not isinstance(frames, list) or len(frames) == 0: + raise ValueError("Frames must be a list of NumPy arrays") + + for frame in frames: + if not isinstance(frame, np.ndarray) or ( + frame.ndim < 2 or frame.shape[0] == 0 or frame.shape[1] == 0 + ): + raise ValueError("A frame is not a valid NumPy array with shape (H, W, C)") + if output_video_path is None: output_video_path = tempfile.NamedTemporaryFile( delete=False, suffix=".mp4" @@ -1907,30 +1921,36 @@ def overlay_bounding_boxes( bboxes = bbox_int[i] bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True) - width, height = pil_image.size - fontsize = max(12, int(min(width, height) / 40)) - draw = ImageDraw.Draw(pil_image) - font = ImageFont.truetype( - str( - resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf") - ), - fontsize, - ) - - for elt in bboxes: - label = elt["label"] - box = elt["bbox"] - scores = elt["score"] - - # denormalize the box if it is normalized - box = denormalize_bbox(box, (height, width)) - draw.rectangle(box, outline=color[label], width=4) - text = f"{label}: {scores:.2f}" - text_box = draw.textbbox((box[0], box[1]), text=text, font=font) - draw.rectangle( - (box[0], box[1], text_box[2], text_box[3]), fill=color[label] + if len(bboxes) > 20: + pil_image = _plot_counting(pil_image, bboxes, color) + else: + width, height = pil_image.size + fontsize = max(12, int(min(width, height) / 40)) + draw = ImageDraw.Draw(pil_image) + font = ImageFont.truetype( + 
str( + resources.files("vision_agent.fonts").joinpath( + "default_font_ch_en.ttf" + ) + ), + fontsize, ) - draw.text((box[0], box[1]), text, fill="black", font=font) + + for elt in bboxes: + label = elt["label"] + box = elt["bbox"] + scores = elt["score"] + + # denormalize the box if it is normalized + box = denormalize_bbox(box, (height, width)) + draw.rectangle(box, outline=color[label], width=4) + text = f"{label}: {scores:.2f}" + text_box = draw.textbbox((box[0], box[1]), text=text, font=font) + draw.rectangle( + (box[0], box[1], text_box[2], text_box[3]), fill=color[label] + ) + draw.text((box[0], box[1]), text, fill="black", font=font) + frame_out.append(np.array(pil_image)) return frame_out[0] if len(frame_out) == 1 else frame_out @@ -2089,39 +2109,19 @@ def overlay_heat_map( return np.array(combined) -def overlay_counting_results( - image: np.ndarray, instances: List[Dict[str, Any]] -) -> np.ndarray: - """'overlay_counting_results' is a utility function that displays counting results on - an image. - - Parameters: - image (np.ndarray): The image to display the bounding boxes on. 
- instances (List[Dict[str, Any]]): A list of dictionaries containing the bounding - box information of each instance - - Returns: - np.ndarray: The image with the instance_id dislpayed - - Example - ------- - >>> image_with_bboxes = overlay_counting_results( - image, [{'score': 0.99, 'label': 'object', 'bbox': [0.1, 0.11, 0.35, 0.4]}], - ) - """ - pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB") - color = (158, 218, 229) - - width, height = pil_image.size +def _plot_counting( + image: Image.Image, + bboxes: List[Dict[str, Any]], + colors: Dict[str, Tuple[int, int, int]], +) -> Image.Image: + width, height = image.size fontsize = max(10, int(min(width, height) / 80)) - pil_image = ImageEnhance.Brightness(pil_image).enhance(0.5) - draw = ImageDraw.Draw(pil_image) + draw = ImageDraw.Draw(image) font = ImageFont.truetype( str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")), fontsize, ) - - for i, elt in enumerate(instances, 1): + for i, elt in enumerate(bboxes, 1): label = f"{i}" box = elt["bbox"] @@ -2143,7 +2143,7 @@ def overlay_counting_results( text_y1 = cy + text_height / 2 # Draw the rectangle encapsulating the text - draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=color) + draw.rectangle((text_x0, text_y0, text_x1, text_y1), fill=colors[elt["label"]]) # Draw the text at the center of the bounding box draw.text( @@ -2154,7 +2154,7 @@ def overlay_counting_results( anchor="lt", ) - return np.array(pil_image) + return image FUNCTION_TOOLS = [ @@ -2187,7 +2187,6 @@ def overlay_counting_results( overlay_bounding_boxes, overlay_segmentation_masks, overlay_heat_map, - overlay_counting_results, ] TOOLS = FUNCTION_TOOLS + UTIL_TOOLS diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 1cc765b6..b2812fc0 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -1,6 +1,6 @@ from enum import Enum -from uuid import UUID from typing import List, Optional, 
Tuple, Union +from uuid import UUID from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer