From bbe0a68e5a6ae84c254b34af85e645121d4b5ff1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Fri, 29 Mar 2024 11:26:05 -0700 Subject: [PATCH] Update Tools to Handle Video (#32) * added iou tools * add image visualization for reflection * update tool return format * typing and flake8 issues * added visualized images to all_tool_results --- tests/tools/test_tools.py | 26 ++++ vision_agent/agent/vision_agent.py | 92 ++++++++++++--- vision_agent/image_utils.py | 100 +++++++++++++++- vision_agent/tools/__init__.py | 15 ++- vision_agent/tools/tools.py | 183 +++++++++++++++++++---------- vision_agent/tools/video.py | 12 +- 6 files changed, 339 insertions(+), 89 deletions(-) create mode 100644 tests/tools/test_tools.py diff --git a/tests/tools/test_tools.py b/tests/tools/test_tools.py new file mode 100644 index 00000000..00f85072 --- /dev/null +++ b/tests/tools/test_tools.py @@ -0,0 +1,26 @@ +import os +import tempfile + +import numpy as np +from PIL import Image + +from vision_agent.tools.tools import BboxIoU, SegIoU + + +def test_bbox_iou(): + bbox1 = [0, 0, 0.75, 0.75] + bbox2 = [0.25, 0.25, 1, 1] + assert BboxIoU()(bbox1, bbox2) == 0.29 + + +def test_seg_iou(): + mask1 = np.zeros((10, 10), dtype=np.uint8) + mask1[2:4, 2:4] = 255 + mask2 = np.zeros((10, 10), dtype=np.uint8) + mask2[3:5, 3:5] = 255 + with tempfile.TemporaryDirectory() as tmpdir: + mask1_path = os.path.join(tmpdir, "mask1.png") + mask2_path = os.path.join(tmpdir, "mask2.png") + Image.fromarray(mask1).save(mask1_path) + Image.fromarray(mask2).save(mask2_path) + assert SegIoU()(mask1_path, mask2_path) == 0.14 diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 5d34fe9e..e9e6d66d 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,11 +1,13 @@ import json import logging import sys +import tempfile from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Tuple, Union from tabulate import tabulate +from vision_agent.image_utils import overlay_bboxes, overlay_masks from vision_agent.llm import LLM, OpenAILLM from vision_agent.lmm import LMM, OpenAILMM from vision_agent.tools import TOOLS @@ -248,12 +250,12 @@ def retrieval( tools: Dict[int, Any], previous_log: str, reflections: str, -) -> Tuple[List[Dict], str]: +) -> Tuple[Dict, str]: tool_id = choose_tool( model, question, {k: v["description"] for k, v in tools.items()}, reflections ) if tool_id is None: - return [{}], "" + return {}, "" _LOGGER.info(f"\t(Tool ID, name): ({tool_id}, {tools[tool_id]['name']})") tool_instructions = tools[tool_id] @@ -265,14 +267,12 @@ def retrieval( ) _LOGGER.info(f"\tParameters: {parameters} for {tool_name}") if parameters is None: - return [{}], "" - tool_results = [ - {"task": question, "tool_name": tool_name, "parameters": parameters} - ] + return {}, "" + tool_results = {"task": question, "tool_name": tool_name, "parameters": parameters} _LOGGER.info( - f"""Going to run the following {len(tool_results)} tool(s) in sequence: -{tabulate(tool_results, headers="keys", tablefmt="mixed_grid")}""" + f"""Going to run the following tool(s) in sequence: +{tabulate([tool_results], headers="keys", tablefmt="mixed_grid")}""" ) def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: @@ -286,12 +286,10 @@ def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: call_results.append(function_call(tools[tool_id]["class"], parameters)) return call_results - call_results = [] - for i, result in enumerate(tool_results): - 
call_results.extend(parse_tool_results(result))
-        tool_results[i]["call_results"] = call_results
+    call_results = parse_tool_results(tool_results)
+    tool_results["call_results"] = call_results
 
-    call_results_str = "\n\n".join([str(e) for e in call_results if e is not None])
+    call_results_str = str(call_results)
     _LOGGER.info(f"\tCall Results: {call_results_str}")
     return tool_results, call_results_str
 
@@ -335,7 +333,11 @@ def self_reflect(
         tool_results=str(tool_result),
         final_answer=final_answer,
     )
-    if issubclass(type(reflect_model), LMM):
+    if (
+        issubclass(type(reflect_model), LMM)
+        and image is not None
+        and Path(image).suffix in [".jpg", ".jpeg", ".png"]
+    ):
         return reflect_model(prompt, image=image)  # type: ignore
     return reflect_model(prompt)
 
@@ -345,6 +347,56 @@ def parse_reflect(reflect: str) -> bool:
     return "finish" in reflect.lower() and len(reflect) < 100
 
 
+def visualize_result(all_tool_results: List[Dict]) -> List[str]:
+    image_to_data: Dict[str, Dict] = {}
+    for tool_result in all_tool_results:
+        if not tool_result["tool_name"] in ["grounding_sam_", "grounding_dino_"]:
+            continue
+
+        parameters = tool_result["parameters"]
+        # parameters can either be a dictionary or list, parameters can also be malformed
+        # because the LLM builds them
+        if isinstance(parameters, dict):
+            if "image" not in parameters:
+                continue
+            parameters = [parameters]
+        elif isinstance(tool_result["parameters"], list):
+            if (
+                len(tool_result["parameters"]) < 1
+                and "image" not in tool_result["parameters"][0]
+            ):
+                continue
+
+        for param, call_result in zip(parameters, tool_result["call_results"]):
+
+            # calls can fail, so we need to check if the call was successful
+            if not isinstance(call_result, dict):
+                continue
+            if "bboxes" not in call_result:
+                continue
+
+            # if the call was successful, then we can add the image data
+            image = param["image"]
+            if image not in image_to_data:
+                image_to_data[image] = {"bboxes": [], "masks": [], "labels": []}
+
+            image_to_data[image]["bboxes"].extend(call_result["bboxes"])
+            image_to_data[image]["labels"].extend(call_result["labels"])
+            if "masks" in call_result:
+                image_to_data[image]["masks"].extend(call_result["masks"])
+
+    visualized_images = []
+    for image in image_to_data:
+        image_path = Path(image)
+        image_data = image_to_data[image]
+        image = overlay_masks(image_path, image_data)
+        image = overlay_bboxes(image, image_data)
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
+            image.save(f.name)
+            visualized_images.append(f.name)
+    return visualized_images
+
+
 class VisionAgent(Agent):
     r"""Vision Agent is an agent framework that utilizes tools as well as self
     reflection to accomplish tasks, in particular vision tasks. Vision Agent is based
@@ -389,7 +441,8 @@ def __call__(
         """Invoke the vision agent.
 
         Parameters:
-            input: a prompt that describe the task or a conversation in the format of [{"role": "user", "content": "describe your task here..."}].
+            input: a prompt that describes the task or a conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
             image: the input image referenced in the prompt parameter.
Returns: @@ -436,9 +489,8 @@ def chat_with_workflow( self.answer_model, task_str, call_results, previous_log, reflections ) - for tool_result in tool_results: - tool_result["answer"] = answer - all_tool_results.extend(tool_results) + tool_results["answer"] = answer + all_tool_results.append(tool_results) _LOGGER.info(f"\tAnswer: {answer}") answers.append({"task": task_str, "answer": answer}) @@ -448,13 +500,15 @@ def chat_with_workflow( self.answer_model, question, answers, reflections ) + visualized_images = visualize_result(all_tool_results) + all_tool_results.append({"visualized_images": visualized_images}) reflection = self_reflect( self.reflect_model, question, self.tools, all_tool_results, final_answer, - image, + visualized_images[0] if len(visualized_images) > 0 else image, ) _LOGGER.info(f"\tReflection: {reflection}") if parse_reflect(reflection): diff --git a/vision_agent/image_utils.py b/vision_agent/image_utils.py index 05a129ce..849f912f 100644 --- a/vision_agent/image_utils.py +++ b/vision_agent/image_utils.py @@ -3,15 +3,38 @@ import base64 from io import BytesIO from pathlib import Path -from typing import Tuple, Union +from typing import Dict, Tuple, Union import numpy as np -from PIL import Image +from PIL import Image, ImageDraw, ImageFont from PIL.Image import Image as ImageType +COLORS = [ + (158, 218, 229), + (219, 219, 141), + (23, 190, 207), + (188, 189, 34), + (199, 199, 199), + (247, 182, 210), + (127, 127, 127), + (227, 119, 194), + (196, 156, 148), + (197, 176, 213), + (140, 86, 75), + (148, 103, 189), + (255, 152, 150), + (152, 223, 138), + (214, 39, 40), + (44, 160, 44), + (255, 187, 120), + (174, 199, 232), + (255, 127, 14), + (31, 119, 180), +] + def b64_to_pil(b64_str: str) -> ImageType: - """Convert a base64 string to a PIL Image. + r"""Convert a base64 string to a PIL Image. Parameters: b64_str: the base64 encoded image @@ -26,7 +49,7 @@ def b64_to_pil(b64_str: str) -> ImageType: def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, ...]: - """Get the size of an image. + r"""Get the size of an image. Parameters: data: the input image @@ -41,7 +64,7 @@ def get_image_size(data: Union[str, Path, np.ndarray, ImageType]) -> Tuple[int, def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str: - """Convert an image to a base64 string. + r"""Convert an image to a base64 string. Parameters: data: the input image @@ -60,3 +83,70 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str: else: arr_bytes = data.tobytes() return base64.b64encode(arr_bytes).decode("utf-8") + + +def overlay_bboxes( + image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict +) -> ImageType: + r"""Plots bounding boxes on to an image. 
+ + Parameters: + image: the input image + bboxes: the bounding boxes to overlay + + Returns: + The image with the bounding boxes overlayed + """ + if isinstance(image, (str, Path)): + image = Image.open(image) + elif isinstance(image, np.ndarray): + image = Image.fromarray(image) + + color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])} + + draw = ImageDraw.Draw(image) + font = ImageFont.load_default() + width, height = image.size + if "bboxes" not in bboxes: + return image.convert("RGB") + + for label, box in zip(bboxes["labels"], bboxes["bboxes"]): + box = [box[0] * width, box[1] * height, box[2] * width, box[3] * height] + draw.rectangle(box, outline=color[label], width=3) + label = f"{label}" + text_box = draw.textbbox((box[0], box[1]), text=label, font=font) + draw.rectangle(text_box, fill=color[label]) + draw.text((text_box[0], text_box[1]), label, fill="black", font=font) + return image.convert("RGB") + + +def overlay_masks( + image: Union[str, Path, np.ndarray, ImageType], masks: Dict, alpha: float = 0.5 +) -> ImageType: + r"""Plots masks on to an image. + + Parameters: + image: the input image + masks: the masks to overlay + alpha: the transparency of the overlay + + Returns: + The image with the masks overlayed + """ + if isinstance(image, (str, Path)): + image = Image.open(image) + elif isinstance(image, np.ndarray): + image = Image.fromarray(image) + + color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(masks["labels"])} + if "masks" not in masks: + return image.convert("RGB") + + for label, mask in zip(masks["labels"], masks["masks"]): + if isinstance(mask, str): + mask = np.array(Image.open(mask)) + np_mask = np.zeros((image.size[1], image.size[0], 4)) + np_mask[mask > 0, :] = color[label] + (255 * alpha,) + mask_img = Image.fromarray(np_mask.astype(np.uint8)) + image = Image.alpha_composite(image.convert("RGBA"), mask_img) + return image.convert("RGB") diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 7e5636c2..20dc503a 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,2 +1,15 @@ from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT -from .tools import CLIP, TOOLS, Counter, Crop, GroundingDINO, GroundingSAM, Tool +from .tools import ( + CLIP, + TOOLS, + BboxArea, + BboxIoU, + Counter, + Crop, + ExtractFrames, + GroundingDINO, + GroundingSAM, + SegArea, + SegIoU, + Tool, +) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 6e55f210..f13c14dd 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -92,7 +92,7 @@ class CLIP(Tool): } # TODO: Add support for input multiple images, which aligns with the output type. - def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict: """Invoke the CLIP model. Parameters: @@ -122,7 +122,7 @@ def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict rets = [] for elt in resp_json["data"]: rets.append({"labels": prompt, "scores": [round(prob, 2) for prob in elt]}) - return cast(List[Dict], rets) + return cast(Dict, rets[0]) class GroundingDINO(Tool): @@ -168,7 +168,7 @@ class GroundingDINO(Tool): } # TODO: Add support for input multiple images, which aligns with the output type. 
- def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]: + def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> Dict: """Invoke the Grounding DINO model. Parameters: @@ -204,7 +204,7 @@ def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict if "scores" in elt: elt["scores"] = [round(score, 2) for score in elt["scores"]] elt["size"] = (image_size[1], image_size[0]) - return cast(List[Dict], resp_data) + return cast(Dict, resp_data) class GroundingSAM(Tool): @@ -259,7 +259,7 @@ class GroundingSAM(Tool): } # TODO: Add support for input multiple images, which aligns with the output type. - def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict: """Invoke the Grounding SAM model. Parameters: @@ -294,7 +294,7 @@ def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict ret_pred["labels"].append(pred["label_name"]) ret_pred["bboxes"].append(normalize_bbox(pred["bbox"], image_size)) ret_pred["masks"].append(mask) - return [ret_pred] + return ret_pred class AgentGroundingSAM(GroundingSAM): @@ -302,15 +302,14 @@ class AgentGroundingSAM(GroundingSAM): returns the file name. This makes it easier for agents to use. """ - def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> Dict: rets = super().__call__(prompt, image) - for ret in rets: - mask_files = [] - for mask in ret["masks"]: - with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: - Image.fromarray(mask * 255).save(tmp) - mask_files.append(tmp.name) - ret["masks"] = mask_files + mask_files = [] + for mask in rets["masks"]: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + Image.fromarray(mask * 255).save(tmp) + mask_files.append(tmp.name) + rets["masks"] = mask_files return rets @@ -363,7 +362,7 @@ class Crop(Tool): ], } - def __call__(self, bbox: List[float], image: Union[str, Path]) -> str: + def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict: pil_image = Image.open(image) width, height = pil_image.size bbox = [ @@ -373,10 +372,10 @@ def __call__(self, bbox: List[float], image: Union[str, Path]) -> str: int(bbox[3] * height), ] cropped_image = pil_image.crop(bbox) # type: ignore - with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: cropped_image.save(tmp.name) - return tmp.name + return {"image": tmp.name} class BboxArea(Tool): @@ -388,7 +387,7 @@ class BboxArea(Tool): "required_parameters": [{"name": "bbox", "type": "List[int]"}], "examples": [ { - "scenario": "If you want to calculate the area of the bounding box [0, 0, 100, 100]", + "scenario": "If you want to calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]", "parameters": {"bboxes": [0.2, 0.21, 0.34, 0.42]}, } ], @@ -430,6 +429,109 @@ def __call__(self, masks: Union[str, Path]) -> float: return cast(float, round(np.sum(np_mask) / 255, 2)) +class BboxIoU(Tool): + name = "bbox_iou_" + description = ( + "'bbox_iou_' returns the intersection over union of two bounding boxes." 
+    )
+    usage = {
+        "required_parameters": [
+            {"name": "bbox1", "type": "List[int]"},
+            {"name": "bbox2", "type": "List[int]"},
+        ],
+        "examples": [
+            {
+                "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]",
+                "parameters": {
+                    "bbox1": [0.2, 0.21, 0.34, 0.42],
+                    "bbox2": [0.3, 0.31, 0.44, 0.52],
+                },
+            }
+        ],
+    }
+
+    def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
+        x1, y1, x2, y2 = bbox1
+        x3, y3, x4, y4 = bbox2
+        xA = max(x1, x3)
+        yA = max(y1, y3)
+        xB = min(x2, x4)
+        yB = min(y2, y4)
+        inter_area = max(0, xB - xA) * max(0, yB - yA)
+        boxa_area = (x2 - x1) * (y2 - y1)
+        boxb_area = (x4 - x3) * (y4 - y3)
+        iou = inter_area / float(boxa_area + boxb_area - inter_area)
+        return round(iou, 2)
+
+
+class SegIoU(Tool):
+    name = "seg_iou_"
+    description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files."
+    usage = {
+        "required_parameters": [
+            {"name": "mask1", "type": "str"},
+            {"name": "mask2", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "If you want to calculate the intersection over union of the segmentation masks for mask_file1.png and mask_file2.png",
+                "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
+            }
+        ],
+    }
+
+    def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
+        pil_mask1 = Image.open(str(mask1))
+        pil_mask2 = Image.open(str(mask2))
+        np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
+        np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
+        intersection = np.logical_and(np_mask1, np_mask2)
+        union = np.logical_or(np_mask1, np_mask2)
+        iou = np.sum(intersection) / np.sum(union)
+        return cast(float, round(iou, 2))
+
+
+class ExtractFrames(Tool):
+    r"""Extract frames from a video."""
+
+    name = "extract_frames_"
+    description = "'extract_frames_' extracts frames where there is motion detected in a video and returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path."
+    usage = {
+        "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "examples": [
+            {
+                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
+                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
+            },
+            {
+                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4"},
+            },
+        ],
+    }
+
+    def __call__(self, video_uri: str) -> List[Tuple[str, float]]:
+        """Extract frames from a video.
+
+
+        Parameters:
+            video_uri: the path to the video file or a url pointing to the video data
+
+        Returns:
+            a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order.
+        """
+        frames = extract_frames_from_video(video_uri)
+        result = []
+        _LOGGER.info(
+            f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks."
+ ) + for frame, ts in frames: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + Image.fromarray(frame).save(tmp) + result.append((tmp.name, ts)) + return result + + class Add(Tool): r"""Add returns the sum of all the arguments passed to it, normalized to 2 decimal places.""" @@ -506,47 +608,6 @@ def __call__(self, input: List[int]) -> float: return round(input[0] / input[1], 2) -class ExtractFrames(Tool): - r"""Extract frames from a video.""" - - name = "extract_frames_" - description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame." - usage = { - "required_parameters": [{"name": "video_uri", "type": "str"}], - "examples": [ - { - "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4", - "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"}, - }, - { - "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4", - "parameters": {"video_uri": "tests/data/test.mp4"}, - }, - ], - } - - def __call__(self, video_uri: str) -> list[tuple[str, float]]: - """Extract frames from a video. - - - Parameters: - video_uri: the path to the video file or a url points to the video data - - Returns: - a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. - """ - frames = extract_frames_from_video(video_uri) - result = [] - _LOGGER.info( - f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks." - ) - for frame, ts in frames: - with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: - Image.fromarray(frame).save(tmp) - result.append((tmp.name, ts)) - return result - - TOOLS = { i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c} for i, c in enumerate( @@ -554,15 +615,17 @@ def __call__(self, video_uri: str) -> list[tuple[str, float]]: CLIP, GroundingDINO, AgentGroundingSAM, + ExtractFrames, Counter, Crop, BboxArea, SegArea, + BboxIoU, + SegIoU, Add, Subtract, Multiply, Divide, - ExtractFrames, ] ) if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage")) diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py index 606166ed..6068725f 100644 --- a/vision_agent/tools/video.py +++ b/vision_agent/tools/video.py @@ -22,12 +22,16 @@ def extract_frames_from_video( Parameters: video_uri: the path to the video file or a video file url fps: the frame rate per second to extract the frames - motion_detection_threshold: The threshold to detect motion between changes/frames. - A value between 0-1, which represents the percentage change required for the frames to be considered in motion. - For example, a lower value means more frames will be extracted. + motion_detection_threshold: The threshold to detect motion between + changes/frames. A value between 0-1, which represents the percentage change + required for the frames to be considered in motion. For example, a lower + value means more frames will be extracted. Returns: - a list of tuples containing the extracted frame and the timestamp in seconds. 
E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. + a list of tuples containing the extracted frame and the timestamp in seconds. + E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds + from the start of the video. E.g. 12.125 means 12.125 seconds from the start of + the video. The frames are sorted by the timestamp in ascending order. """ with VideoFileClip(video_uri) as video: video_duration: float = video.duration
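
Below is a minimal usage sketch, not part of the patch itself, showing how the tools added above (BboxIoU, SegIoU, ExtractFrames) can be exercised. The expected IoU values mirror the assertions in tests/tools/test_tools.py; the video path is a hypothetical placeholder and extracting frames assumes moviepy is available for vision_agent/tools/video.py.

import numpy as np
from PIL import Image

from vision_agent.tools import BboxIoU, ExtractFrames, SegIoU

# Bounding boxes are normalized [x1, y1, x2, y2]; results are rounded to 2 decimals.
print(BboxIoU()([0, 0, 0.75, 0.75], [0.25, 0.25, 1, 1]))  # 0.29, as asserted in test_bbox_iou

# SegIoU compares two mask image files on disk.
mask1 = np.zeros((10, 10), dtype=np.uint8)
mask1[2:4, 2:4] = 255
mask2 = np.zeros((10, 10), dtype=np.uint8)
mask2[3:5, 3:5] = 255
Image.fromarray(mask1).save("mask1.png")
Image.fromarray(mask2).save("mask2.png")
print(SegIoU()("mask1.png", "mask2.png"))  # 0.14, as asserted in test_seg_iou

# ExtractFrames saves motion-filtered frames as temporary PNG files and returns
# (frame_path, timestamp_in_seconds) tuples; the video path here is a placeholder.
for frame_path, ts in ExtractFrames()("tests/data/test.mp4"):
    print(f"{ts:.3f}s -> {frame_path}")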