diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index d730f3a8..f7b1e4c0 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -27,7 +27,7 @@
     florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
@@ -47,7 +47,6 @@
     overlay_heat_map,
     overlay_segmentation_masks,
     owl_v2_image,
-    owl_v2_image2,
     owl_v2_video,
     save_image,
     save_json,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 555c58dc..ab1cafec 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -176,33 +176,6 @@ def owl_v2_image(
             {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5]},
         ]
     """
-    image_size = image.shape[:2]
-    image_b64 = convert_to_b64(image)
-    request_data = {
-        "prompts": [s.strip() for s in prompt.split(",")],
-        "image": image_b64,
-        "confidence": box_threshold,
-        "function_name": "owl_v2",
-    }
-    data: Dict[str, Any] = send_inference_request(request_data, "owlv2", v2=True)
-    return_data = []
-    if data is not None:
-        for elt in data:
-            return_data.append(
-                {
-                    "bbox": normalize_bbox(elt["bbox"], image_size),  # type: ignore
-                    "label": elt["label"],  # type: ignore
-                    "score": round(elt["score"], 2),  # type: ignore
-                }
-            )
-    return return_data
-
-
-def owl_v2_image2(
-    prompt: str,
-    image: np.ndarray,
-    box_threshold: float = 0.30,
-) -> List[Dict[str, Any]]:
     image_size = image.shape[:2]
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
@@ -232,10 +205,11 @@ def owl_v2_video(
     frames: List[np.ndarray],
     box_threshold: float = 0.30,
 ) -> List[List[Dict[str, Any]]]:
-    """'owl_v2_video' is a tool that can detect and count multiple objects given a text
-    prompt such as category names or referring expressions on videos. The categories in
-    text prompt are separated by commas. It returns a list of bounding boxes with
-    normalized coordinates, label names and associated probability scores per frame.
+    """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
+    objects per frame given a text prompt such as a category name or referring
+    expression. The categories in the text prompt are separated by commas. It returns
+    a list of lists where each inner list contains the score, label, and bounding box
+    of the detections for that frame.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -244,7 +218,7 @@
             to 0.30.
 
     Returns:
-        List[List[Dict[str, Any]]]: A list of dictionaries per frame containing the
+        List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the
         score, label, and bounding box of the detected objects with normalized
         coordinates between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the
         coordinates of the top-left and xmax and ymax are the coordinates of the
@@ -414,14 +388,14 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     return return_data
 
 
-def florence2_sam2_video(
+def florence2_sam2_video_tracking(
     prompt: str, frames: List[np.ndarray]
 ) -> List[List[Dict[str, Any]]]:
-    """'florence2_sam2_video' is a tool that can segment and track multiple entities
-    in a video given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It only tracks
-    entities present in the first frame and only returns segmentation masks. It is
-    useful for tracking and counting without duplicating counts.
+    """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
+    entities in a video given a text prompt such as category names or referring
+    expressions. You can optionally separate the categories in the text with commas. It
+    only tracks entities present in the first frame and only returns segmentation
+    masks. It is useful for tracking and counting without duplicating counts.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -456,7 +430,7 @@ def florence2_sam2_video(
     files = [("video", buffer_bytes)]
     payload = {
         "prompts": [s.strip() for s in prompt.split(",")],
-        "function_name": "florence2_sam2_video",
+        "function_name": "florence2_sam2_video_tracking",
    }
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
@@ -1933,7 +1907,7 @@ def overlay_counting_results(
 
 FUNCTION_TOOLS = [
     owl_v2_image,
-    # owl_v2_video,
+    owl_v2_video,
     ocr,
     clip,
     vit_image_classification,
@@ -1942,7 +1916,7 @@
     florence2_image_caption,
     florence2_ocr,
     florence2_sam2_image,
-    florence2_sam2_video,
+    florence2_sam2_video_tracking,
     florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
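A minimal usage sketch of the renamed and newly enabled tools. The import path, signatures, and return shapes come from the diff above; the placeholder frames and prompts are illustrative only, and a real call requires a configured inference endpoint.

    import numpy as np

    from vision_agent.tools import (
        florence2_sam2_video_tracking,  # renamed from florence2_sam2_video
        owl_v2_video,                   # now enabled in FUNCTION_TOOLS
    )

    # Placeholder frames; in practice these would be decoded from a real video.
    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

    # One list of {'score', 'label', 'bbox'} dicts per frame, with bboxes
    # normalized to [0, 1] as (xmin, ymin, xmax, ymax).
    detections = owl_v2_video("car, person", frames, box_threshold=0.30)

    # Segmentation masks tracked from the entities present in the first frame,
    # useful for counting without duplicates.
    tracks = florence2_sam2_video_tracking("car, person", frames)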