From 791455b282f1952181fc860d96c2302ac8b4813b Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sun, 13 Oct 2024 10:07:44 -0700
Subject: [PATCH] remove temporal localization

---
 tests/integ/test_tools.py      | 12 ----------
 vision_agent/tools/__init__.py |  1 -
 vision_agent/tools/tools.py    | 41 ----------------------------------
 3 files changed, 54 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 93680398..42c8e62e 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -24,7 +24,6 @@
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
@@ -363,17 +362,6 @@ def test_ixc25_video_vqa():
     assert "cat" in result.strip()
 
 
-def test_ixc25_temporal_localization():
-    frames = [
-        np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
-    ]
-    result = ixc25_temporal_localization(
-        prompt="What animal is in this video?",
-        frames=frames,
-    )
-    assert result == [True] * 10
-
-
 def test_ocr():
     img = ski.data.page()
     result = ocr(
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 2a75aa2b..16df1193 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -37,7 +37,6 @@
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 9c03467c..86e484b2 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -882,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
-def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
-    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
-    given a prompt that can be other an object or a phrase. It returns a list of
-    boolean values indicating whether the object or phrase is present in the
-    corresponding frame.
-
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-
-    Returns:
-        List[bool]: A list of boolean values indicating whether the object or phrase is
-        present in the corresponding frame.
-
-    Example
-    -------
-    >>> output = ixc25_temporal_localization('soccer goal', frames)
-    >>> print(output)
-    [False, False, False, True, True, True, False, False, False, False]
-    >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
-    """
-
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "chunk_length": 2,
-        "function_name": "ixc25_temporal_localization",
-    }
-    data: List[int] = send_inference_request(
-        payload,
-        "video-temporal-localization?model=internlm-xcomposer",
-        files=files,
-        v2=True,
-    )
-    chunk_size = round(len(frames) / len(data))
-    data_explode = [[elt] * chunk_size for elt in data]
-    data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
-    return data_bool[: len(frames)]
-
-
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text