Skip to content

Commit

Permalink
remove temporal localization
Browse files Browse the repository at this point in the history
  • Loading branch information
dillonalaird committed Oct 13, 2024
1 parent ff982ba commit 791455b
Show file tree
Hide file tree
Showing 3 changed files with 0 additions and 54 deletions.
12 changes: 0 additions & 12 deletions tests/integ/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_temporal_localization,
ixc25_video_vqa,
loca_visual_prompt_counting,
loca_zero_shot_counting,
Expand Down Expand Up @@ -363,17 +362,6 @@ def test_ixc25_video_vqa():
assert "cat" in result.strip()


def test_ixc25_temporal_localization():
    """Check that a clip built from the skimage sample cat image is flagged on every frame."""
    num_frames = 10

    # Build the clip one frame at a time; each frame is a fresh RGB array.
    clip = []
    for _ in range(num_frames):
        cat_rgb = Image.fromarray(ski.data.cat()).convert("RGB")
        clip.append(np.array(cat_rgb))

    flags = ixc25_temporal_localization(
        prompt="What animal is in this video?",
        frames=clip,
    )

    # One boolean per input frame, all expected to be positive.
    assert flags == [True] * num_frames


def test_ocr():
img = ski.data.page()
result = ocr(
Expand Down
1 change: 0 additions & 1 deletion vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_temporal_localization,
ixc25_video_vqa,
load_image,
loca_visual_prompt_counting,
Expand Down
41 changes: 0 additions & 41 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
return cast(str, data["answer"])


def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
    given a prompt that can be either an object or a phrase. It returns a list of
    boolean values indicating whether the object or phrase is present in the
    corresponding frame.

    Parameters:
        prompt (str): The question about the video
        frames (List[np.ndarray]): The reference frames used for the question

    Returns:
        List[bool]: A list of boolean values indicating whether the object or phrase is
            present in the corresponding frame. The list always contains exactly
            one entry per input frame.

    Example
    -------
        >>> output = ixc25_temporal_localization('soccer goal', frames)
        >>> print(output)
        [False, False, False, True, True, True, False, False, False, False]
        >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
    """

    buffer_bytes = frames_to_bytes(frames)
    files = [("video", buffer_bytes)]
    payload = {
        "prompt": prompt,
        "chunk_length": 2,
        "function_name": "ixc25_temporal_localization",
    }
    data: List[int] = send_inference_request(
        payload,
        "video-temporal-localization?model=internlm-xcomposer",
        files=files,
        v2=True,
    )

    num_frames = len(frames)
    if not data:
        # An empty response would otherwise cause a division by zero below;
        # with no positive chunk reported, no frame is marked as matching.
        return [False] * num_frames

    # Each response value is repeated over a run of consecutive frames. Use
    # ceiling division so the expanded list is never shorter than the number
    # of frames: round() can round down (and rounds halves to even), which
    # silently dropped the trailing frames. Any surplus is trimmed below.
    chunk_size = -(-num_frames // len(data))
    data_bool = [bool(elt) for elt in data for _ in range(chunk_size)]
    return data_bool[:num_frames]


def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
"""'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
including regular images or images of documents or presentations. It returns text
Expand Down

0 comments on commit 791455b

Please sign in to comment.