From 0d6f980f6bb904538b3483b98d9cf91e8a4bd177 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 08:07:57 -0700
Subject: [PATCH 01/12] add owlv2 video

---
 vision_agent/tools/__init__.py |  3 +-
 vision_agent/tools/tools.py    | 86 ++++++++++++++++++++++++++++++----
 2 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 90858569..3f966d65 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -46,7 +46,8 @@
     overlay_counting_results,
     overlay_heat_map,
     overlay_segmentation_masks,
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     save_image,
     save_json,
     save_video,

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 31d53f98..d142204d 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -145,15 +145,15 @@ def grounding_dino(
     return return_data
 
 
-def owl_v2(
+def owl_v2_image(
     prompt: str,
     image: np.ndarray,
-    box_threshold: float = 0.10,
+    box_threshold: float = 0.30,
 ) -> List[Dict[str, Any]]:
-    """'owl_v2' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions. The categories in text
-    prompt are separated by commas. It returns a list of bounding boxes with normalized
-    coordinates, label names and associated probability scores.
+    """'owl_v2_image' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions on images. The categories in
+    text prompt are separated by commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores.
 
     Parameters:
         prompt (str): The prompt to ground to the image.
@@ -170,7 +170,7 @@
 
     Example
     -------
-        >>> owl_v2("car, dinosaur", image)
+        >>> owl_v2_image("car, dinosaur", image)
         [
             {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
         ]
@@ -198,6 +198,72 @@
     return return_data
 
 
+def owl_v2_video(
+    prompt: str,
+    frames: List[np.ndarray],
+    box_threshold: float = 0.30,
+) -> List[List[Dict[str, Any]]]:
+    """'owl_v2_video' is a tool that can detect and count multiple objects given a text
+    prompt such as category names or referring expressions on videos. The categories in
+    text prompt are separated by commas. It returns a list of bounding boxes with
+    normalized coordinates, label names and associated probability scores per frame.
+
+    Parameters:
+        prompt (str): The prompt to ground to the video.
+        frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        box_threshold (float, optional): The threshold for the box detection. Defaults
+            to 0.30.
+
+    Returns:
+        List[List[Dict[str, Any]]]: A list of dictionaries per frame containing the
+            score, label, and bounding box of the detected objects with normalized
+            coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
+            coordinates of the top-left and xmax and ymax are the coordinates of the
+            bottom-right of the bounding box.
+
+    Example
+    -------
+        >>> owl_v2_video("car, dinosaur", frames)
+        [
+            [
+                {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+                {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
+            ],
+            ...
+        ]
+    """
+    if len(frames) == 0:
+        raise ValueError("No frames provided")
+
+    image_size = frames[0].shape[:2]
+    buffer_bytes = frames_to_bytes(frames)
+    files = [("video", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "owlv2",
+        "function_name": "owl_v2_video",
+    }
+    data: Dict[str, Any] = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    return_data = []
+    if data is not None:
+        for frame_data in data:
+            return_data_frame = []
+            for elt in frame_data:
+                if elt["score"] >= box_threshold:
+                    return_data_frame.append(
+                        {
+                            "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
+                            "label": elt["label"],  # type: ignore
+                            "score": round(elt["score"], 2),  # type: ignore
+                        }
+                    )
+            return_data.append(return_data_frame)
+
+    return return_data
+
+
 def grounding_sam(
     prompt: str,
     image: np.ndarray,
@@ -351,13 +417,14 @@ def florence2_sam2_video(
                 [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
             },
         ],
+        ...
     ]
     """
 
     buffer_bytes = frames_to_bytes(frames)
     files = [("video", buffer_bytes)]
     payload = {
-        "prompts": prompt.split(","),
+        "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_video",
     }
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
@@ -1820,7 +1887,8 @@ def overlay_counting_results(
 
 
 FUNCTION_TOOLS = [
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     ocr,
     clip,
     vit_image_classification,

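A minimal usage sketch for the new `owl_v2_video` tool introduced above (the `video.mp4` path and the 1 FPS sampling are illustrative assumptions, not part of the patch; `extract_frames` is the existing helper that returns `(frame, timestamp)` tuples):

```python
# Hypothetical usage of owl_v2_video; assumes a local file "video.mp4" exists.
from vision_agent.tools import extract_frames, owl_v2_video

# extract_frames returns (frame, timestamp) tuples; keep only the frame arrays
frames = [frame for frame, _ in extract_frames("video.mp4", fps=1)]

# one list of detections per input frame, each with a normalized xmin/ymin/xmax/ymax bbox
detections_per_frame = owl_v2_video("car, dinosaur", frames, box_threshold=0.30)
for i, dets in enumerate(detections_per_frame):
    print(f"frame {i}: {[d['label'] for d in dets]}")
```
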
From 34eca878f91ca00ab898f6745cdf3baebcb7dc4b Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 09:09:04 -0700
Subject: [PATCH 02/12] update doc extract_frames to include urls

---
 vision_agent/tools/tools.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index d142204d..1ea92461 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1441,12 +1441,12 @@ def closest_box_distance(
 def extract_frames(
     video_uri: Union[str, Path], fps: float = 1
 ) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path or youtube
-    link, returns a list of tuples (frame, timestamp), where timestamp is the relative
-    time in seconds where the frame was captured. The frame is a numpy array.
+    """'extract_frames' extracts frames from a video which can be a file path, url or
+    youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
+    relative time in seconds where the frame was captured. The frame is a numpy array.
 
     Parameters:
-        video_uri (Union[str, Path]): The path to the video file or youtube link
+        video_uri (Union[str, Path]): The path to the video file, url or youtube link
         fps (float, optional): The frame rate per second to extract the frames. Defaults
             to 10.
@@ -1888,7 +1888,7 @@ def overlay_counting_results(
 FUNCTION_TOOLS = [
     owl_v2_image,
-    owl_v2_video,
+    # owl_v2_video,
     ocr,
     clip,
     vit_image_classification,

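Patch 02 only touches documentation, but the behavior it documents is easy to demonstrate. A sketch with a remote file (the URL here is hypothetical):

```python
# Hypothetical: extract_frames accepting an http(s) url per the updated docstring.
from vision_agent.tools import extract_frames

frames_and_ts = extract_frames("https://example.com/clip.mp4", fps=1)
for frame, timestamp in frames_and_ts[:3]:
    # each entry pairs a numpy frame with its capture time in seconds
    print(frame.shape, f"captured at {timestamp:.1f}s")
```
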
From b0ea47f0c759624ccbed916e79b4693805773cf5 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 09:29:04 -0700
Subject: [PATCH 03/12] fix countgd return decimal places

---
 vision_agent/tools/tools.py | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 1ea92461..098c7b70 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -251,14 +251,13 @@ def owl_v2_video(
         for frame_data in data:
             return_data_frame = []
             for elt in frame_data:
-                if elt["score"] >= box_threshold:
-                    return_data_frame.append(
-                        {
-                            "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
-                            "label": elt["label"],  # type: ignore
-                            "score": round(elt["score"], 2),  # type: ignore
-                        }
-                    )
+                return_data_frame.append(
+                    ODResponseData(
+                        label=elt["label"],
+                        bbox=normalize_bbox(elt["bbox"], image_size),
+                        score=round(elt["score"], 2),
+                    )
+                )
             return_data.append(return_data_frame)
 
     return return_data
@@ -616,7 +615,14 @@ def countgd_counting(
         payload, "text-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],  # type: ignore
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),  # type: ignore
+            score=round(bbox["score"], 2),  # type: ignore
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]
@@ -668,7 +674,14 @@ def countgd_example_based_counting(
         payload, "visual-prompts-to-object-detection", files=files, metadata=metadata
     )
     bboxes_per_frame = resp_data[0]
-    bboxes_formatted = [ODResponseData(**bbox) for bbox in bboxes_per_frame]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],  # type: ignore
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),  # type: ignore
+            score=round(bbox["score"], 2),  # type: ignore
+        )
+        for bbox in bboxes_per_frame
+    ]
     filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
     return [bbox.model_dump() for bbox in filtered_bboxes]

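The rounding that patch 03 applies is easy to see in isolation. A standalone sketch of the same formatting step (plain Python with illustrative values, not the library's actual helper):

```python
# Standalone sketch of the 2-decimal rounding applied to raw detection payloads.
raw = {"label": "person", "bounding_box": [0.123456, 0.20001, 0.456789, 0.5], "score": 0.98765}

formatted = {
    "label": raw["label"],
    # round each normalized coordinate and the confidence score to 2 decimal places
    "bbox": [round(x, 2) for x in raw["bounding_box"]],
    "score": round(raw["score"], 2),
}
print(formatted)  # {'label': 'person', 'bbox': [0.12, 0.2, 0.46, 0.5], 'score': 0.99}
```
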
From a8d200608681d1fc951ca6297f57dc044a056cbe Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 12:37:44 -0700
Subject: [PATCH 04/12] fixed return types

---
 vision_agent/tools/tools.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 098c7b70..cc2a877d 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -185,6 +185,7 @@ def owl_v2_image(
         "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
+    print(data)
     return_data = []
     if data is not None:
         for elt in data:
@@ -246,21 +247,23 @@ def owl_v2_video(
     data: Dict[str, Any] = send_inference_request(
         payload, "text-to-object-detection", files=files, v2=True
     )
-    return_data = []
+    print(data)
+    bboxes_formatted = []
     if data is not None:
         for frame_data in data:
-            return_data_frame = []
+            bboxes_formated_frame = []
             for elt in frame_data:
-                return_data_frame.append(
+                bboxes_formated_frame.append(
                     ODResponseData(
-                        label=elt["label"],
-                        bbox=normalize_bbox(elt["bbox"], image_size),
-                        score=round(elt["score"], 2),
+                        label=elt["label"],  # type: ignore
+                        bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
+                        score=round(elt["score"], 2),  # type: ignore
                     )
                 )
-            return_data.append(return_data_frame)
+            bboxes_formatted.append(bboxes_formated_frame)
 
-    return return_data
+    filtered_bboxes = [filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted]
+    return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]
 
 
 def grounding_sam(

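The per-frame filtering that patch 04 moves into `owl_v2_video` behaves like the following plain-Python sketch (`filter_frame` is a stand-in for the library's `filter_bboxes_by_threshold`, not its actual implementation):

```python
# Plain-Python sketch of per-frame threshold filtering over video detections.
from typing import Any, Dict, List

def filter_frame(dets: List[Dict[str, Any]], box_threshold: float) -> List[Dict[str, Any]]:
    # keep only detections at or above the confidence threshold
    return [d for d in dets if d["score"] >= box_threshold]

frames_dets = [
    [{"label": "car", "score": 0.45, "bbox": [0.1, 0.1, 0.3, 0.3]},
     {"label": "car", "score": 0.12, "bbox": [0.5, 0.5, 0.7, 0.7]}],
    [{"label": "car", "score": 0.31, "bbox": [0.2, 0.2, 0.4, 0.4]}],
]
filtered = [filter_frame(f, 0.30) for f in frames_dets]
print([len(f) for f in filtered])  # [1, 1]
```
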
From 4af5053d730ae0e37263868a030910af5aad97e0 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 12:50:14 -0700
Subject: [PATCH 05/12] prompt tests to run faster

---
 .../agent/vision_agent_coder_prompts.py | 48 ++++++++++++++++---
 1 file changed, 41 insertions(+), 7 deletions(-)

diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py
index b4c8a9bf..df68372c 100644
--- a/vision_agent/agent/vision_agent_coder_prompts.py
+++ b/vision_agent/agent/vision_agent_coder_prompts.py
@@ -70,30 +70,64 @@
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
 4. Print this final dictionary.
+5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 
 **Example**:
+--- EXAMPLE1 ---
 plan1:
 - Load the image from the provided file path 'image.jpg'.
-- Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image.
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 plan2:
 - Load the image from the provided file path 'image.jpg'.
-- Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image.
+- Use the 'florence2_sam2_image' tool with the prompt 'person' to detect and count the number of people in the image.
 - Count the number of detected objects labeled as 'person'.
 plan3:
 - Load the image from the provided file path 'image.jpg'.
 - Use the 'countgd_counting' tool to count the dominant foreground object, which in this case is people.
 
 ```python
-from vision_agent.tools import load_image, owl_v2, grounding_sam, countgd_counting
+from vision_agent.tools import load_image, owl_v2_image, florence2_sam2_image, countgd_counting
 image = load_image("image.jpg")
-owl_v2_out = owl_v2("person", image)
+owl_v2_out = owl_v2_image("person", image)
 
-gsam_out = grounding_sam("person", image)
-gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
+f2s2_out = florence2_sam2_image("person", image)
+# strip out the masks from the output because they don't provide useful information when printed
+f2s2_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in f2s2_out]
 
 cgd_out = countgd_counting(image)
 
-final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_sam2_image": f2s2_out, "countgd_counting": cgd_out}}
 print(final_out)
+```
+
+--- EXAMPLE2 ---
+plan1:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
+plan2:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+plan3:
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
+
+
+```python
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
+
+# sample at 1 FPS and use the first 10 frames to reduce processing time
+frames = extract_frames("video.mp4", 1)
+frames = [f[0] for f in frames][:10]
+
+# plan1
+owl_v2_out = [owl_v2_image("person", f) for f in frames]
+
+# plan2
+florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+
+# plan3
+countgd_out = [countgd_counting(f) for f in frames]
+
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "countgd_counting": countgd_out}}
+print(final_out)
 ```
 """

From f1d5f1f859499ed530cd8594ef90152983688cec Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 12:51:40 -0700
Subject: [PATCH 06/12] testing owlv2_video

---
 vision_agent/tools/__init__.py |  1 +
 vision_agent/tools/tools.py    | 41 +++++++++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 3f966d65..d730f3a8 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -47,6 +47,7 @@
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
+    owl_v2_image2,
     owl_v2_video,
     save_image,
     save_json,

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index cc2a877d..555c58dc 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -185,7 +185,6 @@ def owl_v2_image(
         "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
-    print(data)
     return_data = []
     if data is not None:
         for elt in data:
@@ -199,6 +198,35 @@
     return return_data
 
 
+def owl_v2_image2(
+    prompt: str,
+    image: np.ndarray,
+    box_threshold: float = 0.30,
+) -> List[Dict[str, Any]]:
+    image_size = image.shape[:2]
+    buffer_bytes = numpy_to_bytes(image)
+    files = [("image", buffer_bytes)]
+    payload = {
+        "prompts": [s.strip() for s in prompt.split(",")],
+        "model": "owlv2",
+        "function_name": "owl_v2_image",
+    }
+    resp_data = send_inference_request(
+        payload, "text-to-object-detection", files=files, v2=True
+    )
+    bboxes = resp_data[0]
+    bboxes_formatted = [
+        ODResponseData(
+            label=bbox["label"],  # type: ignore
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),  # type: ignore
+            score=round(bbox["score"], 2),  # type: ignore
+        )
+        for bbox in bboxes
+    ]
+    filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
+    return [bbox.model_dump() for bbox in filtered_bboxes]
+
+
 def owl_v2_video(
     prompt: str,
     frames: List[np.ndarray],
@@ -247,20 +275,21 @@ def owl_v2_video(
     data: Dict[str, Any] = send_inference_request(
         payload, "text-to-object-detection", files=files, v2=True
     )
-    print(data)
     bboxes_formatted = []
     if data is not None:
         for frame_data in data:
             bboxes_formated_frame = []
             for elt in frame_data:
                 bboxes_formated_frame.append(
                     ODResponseData(
-                        label=elt["label"], # type: ignore
-                        bbox=normalize_bbox(elt["bounding_box"], image_size), # type: ignore
-                        score=round(elt["score"], 2), # type: ignore
+                        label=elt["label"],  # type: ignore
+                        bbox=normalize_bbox(elt["bounding_box"], image_size),  # type: ignore
+                        score=round(elt["score"], 2),  # type: ignore
                     )
                 )
             bboxes_formatted.append(bboxes_formated_frame)
 
-    filtered_bboxes = [filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted]
+    filtered_bboxes = [
+        filter_bboxes_by_threshold(elt, box_threshold) for elt in bboxes_formatted
+    ]
     return [[bbox.model_dump() for bbox in frame] for frame in filtered_bboxes]

From e4140035f0a02dc0bbf60e2d22c80074e9505481 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 14:10:41 -0700
Subject: [PATCH 07/12] updated name to florence2_sam2_video_tracking

---
 vision_agent/tools/__init__.py |  3 +-
 vision_agent/tools/tools.py    | 56 +++++++++------------------------
 2 files changed, 16 insertions(+), 43 deletions(-)

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index d730f3a8..f7b1e4c0 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -27,7 +27,7 @@
     florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
@@ -47,7 +47,6 @@
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
-    owl_v2_image2,
     owl_v2_video,
     save_image,
     save_json,

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 555c58dc..ab1cafec 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -176,33 +176,6 @@ def owl_v2_image(
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
     ]
     """
-    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(image)
-    request_data = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "image": image_b64,
-        "confidence": box_threshold,
-        "function_name": "owl_v2",
-    }
-    data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
-    return_data = []
-    if data is not None:
-        for elt in data:
-            return_data.append(
-                {
-                    "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
-                    "label": elt["label"],  # type: ignore
-                    "score": round(elt["score"], 2),  # type: ignore
-                }
-            )
-    return return_data
-
-
-def owl_v2_image2(
-    prompt: str,
-    image: np.ndarray,
-    box_threshold: float = 0.30,
-) -> List[Dict[str, Any]]:
     image_size = image.shape[:2]
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
@@ -232,10 +205,11 @@ def owl_v2_video(
     prompt: str,
     frames: List[np.ndarray],
     box_threshold: float = 0.30,
 ) -> List[List[Dict[str, Any]]]:
-    """'owl_v2_video' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions on videos. The categories in
-    text prompt are separated by commas. It returns a list of bounding boxes with
-    normalized coordinates, label names and associated probability scores per frame.
+    """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
+    objects per frame given a text prompt such as a category name or referring
+    expression. The categories in text prompt are separated by commas. It returns a list
+    of lists where each inner list contains the score, label, and bounding box of the
+    detections for that frame.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
         box_threshold (float, optional): The threshold for the box detection. Defaults
             to 0.30.
 
     Returns:
-        List[List[Dict[str, Any]]]: A list of dictionaries per frame containing the
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
             score, label, and bounding box of the detected objects with normalized
             coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
             coordinates of the top-left and xmax and ymax are the coordinates of the
             bottom-right of the bounding box.
@@ -414,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
     return return_data
 
 
-def florence2_sam2_video(
+def florence2_sam2_video_tracking(
     prompt: str, frames: List[np.ndarray]
 ) -> List[List[Dict[str, Any]]]:
-    """'florence2_sam2_video' is a tool that can segment and track multiple entities
-    in a video given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It only tracks
-    entities present in the first frame and only returns segmentation masks. It is
-    useful for tracking and counting without duplicating counts.
+    """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
+    entities in a video given a text prompt such as category names or referring
+    expressions. You can optionally separate the categories in the text with commas. It
+    only tracks entities present in the first frame and only returns segmentation
+    masks. It is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -456,7 +430,7 @@
     files = [("video", buffer_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_video",
+        "function_name": "florence2_sam2_video_tracking",
     }
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
@@ -1933,7 +1907,7 @@ def overlay_counting_results(
 
 
 FUNCTION_TOOLS = [
     owl_v2_image,
-    # owl_v2_video,
+    owl_v2_video,
     ocr,
     clip,
     vit_image_classification,
@@ -1942,7 +1916,7 @@
     florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,

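A minimal sketch of calling the renamed tracking tool (the video path and the 10-frame cap are illustrative assumptions):

```python
# Hypothetical usage of florence2_sam2_video_tracking on a local clip.
from vision_agent.tools import extract_frames, florence2_sam2_video_tracking

frames = [frame for frame, _ in extract_frames("video.mp4", fps=1)][:10]

# one list of tracked entities per frame; each entity carries a label and a mask array
tracked = florence2_sam2_video_tracking("person", frames)
for i, entities in enumerate(tracked):
    print(f"frame {i}: {len(entities)} tracked masks")
```
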
From ffef05bdfe85018809f1b7f4c8e8f61b83dfd294 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 15:51:40 -0700
Subject: [PATCH 08/12] lowered threshold

---
 vision_agent/tools/tools.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index ab1cafec..1cb90cf3 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -148,7 +148,7 @@ def grounding_dino(
 def owl_v2_image(
     prompt: str,
     image: np.ndarray,
-    box_threshold: float = 0.30,
+    box_threshold: float = 0.10,
 ) -> List[Dict[str, Any]]:
     """'owl_v2_image' is a tool that can detect and count multiple objects given a text
@@ -203,7 +203,7 @@
 def owl_v2_video(
     prompt: str,
     frames: List[np.ndarray],
-    box_threshold: float = 0.30,
+    box_threshold: float = 0.10,
 ) -> List[List[Dict[str, Any]]]:
     """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
     objects per frame given a text prompt such as a category name or referring

From 9bdbf1db0c5bede4f17ccce1efeb64f575054a98 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 15:51:56 -0700
Subject: [PATCH 09/12] ran isort

---
 vision_agent/utils/video.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/utils/video.py b/vision_agent/utils/video.py
index 51774279..d306f295 100644
--- a/vision_agent/utils/video.py
+++ b/vision_agent/utils/video.py
@@ -4,8 +4,8 @@
 from functools import lru_cache
 from typing import List, Optional, Tuple
 
-import cv2
 import av  # type: ignore
+import cv2
 import numpy as np
 from decord import VideoReader  # type: ignore

From 0fcfe04d3af900c2df4dd8580b57a875e493aded Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 15:53:18 -0700
Subject: [PATCH 10/12] fix mypy errors

---
 vision_agent/tools/tools.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 1cb90cf3..e8e23ba6 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -190,9 +190,9 @@ def owl_v2_image(
     bboxes = resp_data[0]
     bboxes_formatted = [
         ODResponseData(
-            label=bbox["label"],  # type: ignore
-            bbox=normalize_bbox(bbox["bounding_box"], image_size),  # type: ignore
-            score=round(bbox["score"], 2),  # type: ignore
+            label=bbox["label"],
+            bbox=normalize_bbox(bbox["bounding_box"], image_size),
+            score=round(bbox["score"], 2),
         )
         for bbox in bboxes
     ]
@@ -623,9 +623,9 @@ def countgd_counting(
     bboxes_per_frame = resp_data[0]
     bboxes_formatted = [
         ODResponseData(
-            label=bbox["label"],  # type: ignore
-            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),  # type: ignore
-            score=round(bbox["score"], 2),  # type: ignore
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
         )
         for bbox in bboxes_per_frame
     ]
@@ -682,9 +682,9 @@ def countgd_example_based_counting(
     bboxes_per_frame = resp_data[0]
     bboxes_formatted = [
         ODResponseData(
-            label=bbox["label"],  # type: ignore
-            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),  # type: ignore
-            score=round(bbox["score"], 2),  # type: ignore
+            label=bbox["label"],
+            bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+            score=round(bbox["score"], 2),
        )
         for bbox in bboxes_per_frame
     ]

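After patch 08 the default `box_threshold` drops to 0.10; callers who want the earlier, stricter behavior can pass the threshold explicitly. A sketch (the all-zeros image is a placeholder, real use would pass actual frames):

```python
# Sketch: pinning the detection threshold explicitly after the default changed to 0.10.
import numpy as np
from vision_agent.tools import owl_v2_image

image = np.zeros((480, 640, 3), dtype=np.uint8)  # placeholder frame

loose = owl_v2_image("person", image)                       # default box_threshold=0.10
strict = owl_v2_image("person", image, box_threshold=0.30)  # previous default
print(len(loose), len(strict))
```
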
From 45a160a922fc688cec4dd4668c1fcc6afe4fd6f7 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 16:03:44 -0700
Subject: [PATCH 11/12] fix tests

---
 tests/integ/test_tools.py           | 26 ++++++++++++++++++------
 tests/integration_dev/test_tools.py |  5 +----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index bca1f6ea..ec45f7c9 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -10,11 +10,11 @@
     detr_segmentation,
     dpt_hybrid_midas,
     florence2_image_caption,
-    florence2_phrase_grounding,
     florence2_ocr,
+    florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     generate_pose_image,
     generate_soft_edge_image,
     git_vqa_v2,
@@ -25,7 +25,8 @@
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
     ocr,
-    owl_v2,
+    owl_v2_image,
+    owl_v2_video,
     template_match,
     vit_image_classification,
     vit_nsfw_classification,
@@ -53,9 +54,9 @@ def test_grounding_dino_tiny():
     assert [res["label"] for res in result] == ["coin"] * 24
 
 
-def test_owl():
+def test_owl_v2_image():
     img = ski.data.coins()
-    result = owl_v2(
+    result = owl_v2_image(
         prompt="coin",
         image=img,
     )
@@ -63,6 +64,19 @@
     assert [res["label"] for res in result] == ["coin"] * 25
 
 
+def test_owl_v2_video():
+    frames = [
+        np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+    ]
+    result = owl_v2_video(
+        prompt="coin",
+        frames=frames,
+    )
+
+    assert len(result) == 10
+    assert len([res["label"] for res in result[0]]) == 25
+
+
 def test_object_detection():
     img = ski.data.coins()
     result = florence2_phrase_grounding(
@@ -108,7 +122,7 @@ def test_florence2_sam2_video():
     frames = [
         np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
     ]
-    result = florence2_sam2_video(
+    result = florence2_sam2_video_tracking(
         prompt="coin",
         frames=frames,
     )

diff --git a/tests/integration_dev/test_tools.py b/tests/integration_dev/test_tools.py
index 29262245..246c5642 100644
--- a/tests/integration_dev/test_tools.py
+++ b/tests/integration_dev/test_tools.py
@@ -1,9 +1,6 @@
 import skimage as ski
 
-from vision_agent.tools import (
-    countgd_counting,
-    countgd_example_based_counting,
-)
+from vision_agent.tools import countgd_counting, countgd_example_based_counting
 
 
 def test_countgd_counting() -> None:

From c34d9d1fc1ce1d4545205154d216dd0725e87160 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 6 Sep 2024 16:08:19 -0700
Subject: [PATCH 12/12] fix tests

---
 tests/integ/test_tools.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index ec45f7c9..24bd259f 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -60,8 +60,8 @@ def test_owl_v2_image():
         prompt="coin",
         image=img,
     )
-    assert len(result) == 25
-    assert [res["label"] for res in result] == ["coin"] * 25
+    assert 24 <= len(result) <= 26
+    assert [res["label"] for res in result] == ["coin"] * len(result)
 
 
 def test_owl_v2_video():
@@ -74,7 +74,7 @@ def test_owl_v2_video():
     )
 
     assert len(result) == 10
-    assert len([res["label"] for res in result[0]]) == 25
+    assert 24 <= len([res["label"] for res in result[0]]) <= 26
 
 
 def test_object_detection():
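An end-to-end sketch mirroring the updated integration tests, using the same synthetic coins image the tests build with skimage and Pillow (detection counts can drift slightly between runs, which is why the tests check ranges rather than exact values):

```python
# End-to-end sketch combining the new image and video OWLv2 tools on synthetic frames.
import numpy as np
import skimage as ski
from PIL import Image

from vision_agent.tools import owl_v2_image, owl_v2_video

image = ski.data.coins()
frames = [np.array(Image.fromarray(image).convert("RGB")) for _ in range(10)]

# single-image detection, then the per-frame video variant
image_dets = owl_v2_image("coin", image)
video_dets = owl_v2_video("coin", frames)

print(len(image_dets), [len(f) for f in video_dets])
```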