From e0cbd8ef1bd0ed1806ed830a5f1056cdaf5b28af Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 23 Sep 2024 15:46:29 -0700
Subject: [PATCH 1/3] overlay bboxes works with frames

---
 vision_agent/tools/tools.py | 77 ++++++++++++++++++++++---------------
 1 file changed, 47 insertions(+), 30 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 34c037ab..22c0a000 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1759,14 +1759,17 @@ def _save_video_to_result(video_uri: str) -> None:
 
 
 def overlay_bounding_boxes(
-    image: np.ndarray, bboxes: List[Dict[str, Any]]
+    medias: Union[np.ndarray, List[np.ndarray]],
+    bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
 ) -> np.ndarray:
     """'overlay_bounding_boxes' is a utility function that displays bounding boxes on
     an image.
 
     Parameters:
-        image (np.ndarray): The image to display the bounding boxes on.
-        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding
+        medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
+            the bounding boxes on.
+        bboxes (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
+            dictionaries or a list of lists of dictionaries containing the bounding
             boxes.
 
     Returns:
@@ -1778,41 +1781,54 @@ def overlay_bounding_boxes(
             image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}],
         )
     """
-    pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGB")
 
-    if len(set([box["label"] for box in bboxes])) > len(COLORS):
+    medias_int: List[np.ndarray] = (
+        [medias] if isinstance(medias, np.ndarray) else medias
+    )
+    bbox_int = [bboxes] if isinstance(bboxes[0], dict) else bboxes
+    bbox_int = cast(List[List[Dict[str, Any]]], bbox_int)
+    labels = set([bb["label"] for b in bbox_int for bb in b])
+
+    if len(labels) > len(COLORS):
         _LOGGER.warning(
             "Number of unique labels exceeds the number of available colors. Some labels may have the same color."
         )
 
-    color = {
-        label: COLORS[i % len(COLORS)]
-        for i, label in enumerate(set([box["label"] for box in bboxes]))
-    }
-    bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
+    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(labels)}
 
-    width, height = pil_image.size
-    fontsize = max(12, int(min(width, height) / 40))
-    draw = ImageDraw.Draw(pil_image)
-    font = ImageFont.truetype(
-        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
-        fontsize,
-    )
+    frame_out = []
+    for i, frame in enumerate(medias_int):
+        pil_image = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
 
-    for elt in bboxes:
-        label = elt["label"]
-        box = elt["bbox"]
-        scores = elt["score"]
+        bboxes = bbox_int[i]
+        bboxes = sorted(bboxes, key=lambda x: x["label"], reverse=True)
 
-        # denormalize the box if it is normalized
-        box = denormalize_bbox(box, (height, width))
+        width, height = pil_image.size
+        fontsize = max(12, int(min(width, height) / 40))
+        draw = ImageDraw.Draw(pil_image)
+        font = ImageFont.truetype(
+            str(
+                resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")
+            ),
+            fontsize,
+        )
 
-        draw.rectangle(box, outline=color[label], width=4)
-        text = f"{label}: {scores:.2f}"
-        text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
-        draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
-        draw.text((box[0], box[1]), text, fill="black", font=font)
-    return np.array(pil_image)
+        for elt in bboxes:
+            label = elt["label"]
+            box = elt["bbox"]
+            scores = elt["score"]
+
+            # denormalize the box if it is normalized
+            box = denormalize_bbox(box, (height, width))
+            draw.rectangle(box, outline=color[label], width=4)
+            text = f"{label}: {scores:.2f}"
+            text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
+            draw.rectangle(
+                (box[0], box[1], text_box[2], text_box[3]), fill=color[label]
+            )
+            draw.text((box[0], box[1]), text, fill="black", font=font)
+        frame_out.append(np.array(pil_image))
+    return frame_out[0] if len(frame_out) == 1 else frame_out
 
 
 def _get_text_coords_from_mask(
@@ -1852,7 +1868,8 @@ def overlay_segmentation_masks(
         medias (Union[np.ndarray, List[np.ndarray]]): The image or frames to display
             the masks on.
         masks (Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]): A list of
-            dictionaries containing the masks, labels and scores.
+            dictionaries or a list of lists of dictionaries containing the masks,
+            labels and scores.
         draw_label (bool, optional): If True, the labels will be displayed on the image.
         secondary_label_key (str, optional): The key to use for the secondary
             tracking label which is needed in videos to display tracking information.

From 10d5731bfde54947049fb6ec5d88f1c6b69d096a Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Tue, 24 Sep 2024 08:13:56 -0700
Subject: [PATCH 2/3] fix mkdocs

---
 docs/api/lmm.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/api/lmm.md b/docs/api/lmm.md
index da7ea71d..96087016 100644
--- a/docs/api/lmm.md
+++ b/docs/api/lmm.md
@@ -4,4 +4,4 @@
 
 ::: vision_agent.lmm.OllamaLMM
 
-::: vision_agent.lmm.ClaudeSonnetLMM
+::: vision_agent.lmm.AnthropicLMM

From d889de51977517bb5eeb0b9286f928609667a638 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Tue, 24 Sep 2024 08:14:05 -0700
Subject: [PATCH 3/3] fix return type

---
 vision_agent/tools/tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 22c0a000..95fdd56c 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1761,7 +1761,7 @@ def _save_video_to_result(video_uri: str) -> None:
 def overlay_bounding_boxes(
     medias: Union[np.ndarray, List[np.ndarray]],
     bboxes: Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]],
-) -> np.ndarray:
+) -> Union[np.ndarray, List[np.ndarray]]:
     """'overlay_bounding_boxes' is a utility function that displays bounding boxes on
     an image.