From 791455b282f1952181fc860d96c2302ac8b4813b Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sun, 13 Oct 2024 10:07:44 -0700
Subject: [PATCH] remove temporal localization

---
 tests/integ/test_tools.py      | 12 ----------
 vision_agent/tools/__init__.py |  1 -
 vision_agent/tools/tools.py    | 41 ----------------------------------
 3 files changed, 54 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 93680398..42c8e62e 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -24,7 +24,6 @@
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
@@ -363,17 +362,6 @@ def test_ixc25_video_vqa():
     assert "cat" in result.strip()
 
 
-def test_ixc25_temporal_localization():
-    frames = [
-        np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
-    ]
-    result = ixc25_temporal_localization(
-        prompt="What animal is in this video?",
-        frames=frames,
-    )
-    assert result == [True] * 10
-
-
 def test_ocr():
     img = ski.data.page()
     result = ocr(
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 2a75aa2b..16df1193 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -37,7 +37,6 @@
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 9c03467c..86e484b2 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -882,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
-def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
-    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
-    given a prompt that can be other an object or a phrase. It returns a list of
-    boolean values indicating whether the object or phrase is present in the
-    corresponding frame.
-
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-
-    Returns:
-        List[bool]: A list of boolean values indicating whether the object or phrase is
-        present in the corresponding frame.
-
-    Example
-    -------
-    >>> output = ixc25_temporal_localization('soccer goal', frames)
-    >>> print(output)
-    [False, False, False, True, True, True, False, False, False, False]
-    >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
-    """
-
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "chunk_length": 2,
-        "function_name": "ixc25_temporal_localization",
-    }
-    data: List[int] = send_inference_request(
-        payload,
-        "video-temporal-localization?model=internlm-xcomposer",
-        files=files,
-        v2=True,
-    )
-    chunk_size = round(len(frames) / len(data))
-    data_explode = [[elt] * chunk_size for elt in data]
-    data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
-    return data_bool[: len(frames)]
-
-
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text