
Commit

add video support
Dayof committed Oct 1, 2024
1 parent 075b897 commit 294fecd
Showing 7 changed files with 155 additions and 41 deletions.
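In short: `florence2_phrase_grounding` is renamed to `florence2_phrase_grounding_image`, and a new `florence2_phrase_grounding_video` tool runs the same phrase grounding over every frame of a clip. A minimal usage sketch based on the signatures and docstrings in the diff below (the placeholder arrays are made up for illustration):

```python
import numpy as np

from vision_agent.tools import (
    florence2_phrase_grounding_image,  # renamed from florence2_phrase_grounding
    florence2_phrase_grounding_video,  # new in this commit
)

image = np.zeros((256, 256, 3), dtype=np.uint8)  # placeholder image
frames = [image.copy() for _ in range(10)]       # placeholder 10-frame clip

# Image variant: returns a flat list of {"score", "label", "bbox"} dicts.
image_dets = florence2_phrase_grounding_image("person", image)

# Video variant: returns one such list per frame.
video_dets = florence2_phrase_grounding_video("person", frames)
assert len(video_dets) == len(frames)
```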
37 changes: 32 additions & 5 deletions tests/integ/test_tools.py
@@ -11,7 +11,8 @@
dpt_hybrid_midas,
florence2_image_caption,
florence2_ocr,
florence2_phrase_grounding,
florence2_phrase_grounding_image,
florence2_phrase_grounding_video,
florence2_roberta_vqa,
florence2_sam2_image,
florence2_sam2_video_tracking,
@@ -92,19 +93,19 @@ def test_owl_v2_video():
assert 24 <= len([res["label"] for res in result[0]]) <= 26


def test_florence2_phrase_grounding():
def test_florence2_phrase_grounding_image():
img = ski.data.coins()
result = florence2_phrase_grounding(
result = florence2_phrase_grounding_image(
image=img,
prompt="coin",
)
assert len(result) == 25
assert [res["label"] for res in result] == ["coin"] * 25


def test_florence2_phrase_grounding_fine_tune_id():
def test_florence2_phrase_grounding_image_fine_tune_id():
img = ski.data.coins()
result = florence2_phrase_grounding(
result = florence2_phrase_grounding_image(
prompt="coin",
image=img,
fine_tune_id=FINE_TUNE_ID,
@@ -114,6 +115,32 @@ def test_florence2_phrase_grounding_fine_tune_id():
assert [res["label"] for res in result] == ["coin"] * len(result)


def test_florence2_phrase_grounding_video():
frames = [
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
]
result = florence2_phrase_grounding_video(
prompt="coin",
frames=frames,
)
assert len(result) == 10
assert 24 <= len([res["label"] for res in result[0]]) <= 26


def test_florence2_phrase_grounding_video_fine_tune_id():
frames = [
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
]
# this calls a fine-tuned florence2 model which is going to be worse at this task
result = florence2_phrase_grounding_video(
prompt="coin",
frames=frames,
fine_tune_id=FINE_TUNE_ID,
)
assert len(result) == 10
assert 24 <= len([res["label"] for res in result[0]]) <= 26


def test_template_match():
img = ski.data.coins()
result = template_match(
16 changes: 8 additions & 8 deletions tests/unit/test_meta_tools.py
@@ -33,41 +33,41 @@ def test_use_object_detection_fine_tuning_none():

def test_use_object_detection_fine_tuning():
artifacts = Artifacts("test")
code = """florence2_phrase_grounding('one', image1)
code = """florence2_phrase_grounding_image('one', image1)
owl_v2_image('two', image2)
florence2_sam2_image('three', image3)"""
expected_code = """florence2_phrase_grounding("one", image1, "123")
expected_code = """florence2_phrase_grounding_image("one", image1, "123")
owl_v2_image("two", image2, "123")
florence2_sam2_image("three", image3, "123")"""
artifacts["code"] = code

output = use_object_detection_fine_tuning(artifacts, "code", "123")
assert 'florence2_phrase_grounding("one", image1, "123")' in output
assert 'florence2_phrase_grounding_image("one", image1, "123")' in output
assert 'owl_v2_image("two", image2, "123")' in output
assert 'florence2_sam2_image("three", image3, "123")' in output
assert artifacts["code"] == expected_code


def test_use_object_detection_fine_tuning_twice():
artifacts = Artifacts("test")
code = """florence2_phrase_grounding('one', image1)
code = """florence2_phrase_grounding_image('one', image1)
owl_v2_image('two', image2)
florence2_sam2_image('three', image3)"""
expected_code1 = """florence2_phrase_grounding("one", image1, "123")
expected_code1 = """florence2_phrase_grounding_image("one", image1, "123")
owl_v2_image("two", image2, "123")
florence2_sam2_image("three", image3, "123")"""
expected_code2 = """florence2_phrase_grounding("one", image1, "456")
expected_code2 = """florence2_phrase_grounding_image("one", image1, "456")
owl_v2_image("two", image2, "456")
florence2_sam2_image("three", image3, "456")"""
artifacts["code"] = code
output = use_object_detection_fine_tuning(artifacts, "code", "123")
assert 'florence2_phrase_grounding("one", image1, "123")' in output
assert 'florence2_phrase_grounding_image("one", image1, "123")' in output
assert 'owl_v2_image("two", image2, "123")' in output
assert 'florence2_sam2_image("three", image3, "123")' in output
assert artifacts["code"] == expected_code1

output = use_object_detection_fine_tuning(artifacts, "code", "456")
assert 'florence2_phrase_grounding("one", image1, "456")' in output
assert 'florence2_phrase_grounding_image("one", image1, "456")' in output
assert 'owl_v2_image("two", image2, "456")' in output
assert 'florence2_sam2_image("three", image3, "456")' in output
assert artifacts["code"] == expected_code2
10 changes: 5 additions & 5 deletions vision_agent/agent/vision_agent_coder_prompts.py
@@ -101,15 +101,15 @@
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
plan2:
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
plan3:
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
```python
import numpy as np
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
# sample at 1 FPS and use the first 10 frames to reduce processing time
frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ def get_counts(preds):
owl_v2_counts = get_counts(owl_v2_out)
# plan2
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
florence2_counts = get_counts(florence2_out)
# plan3
@@ -153,13 +153,13 @@ def get_counts(preds):
final_out = {{
"owl_v2_video": owl_v2_out,
"florence2_phrase_grounding": florence2_out,
"florence2_phrase_grounding_image": florence2_out,
"florence2_sam2_video_tracking": f2s2_out,
}}
counts = {{
"owl_v2_video": owl_v2_counts,
"florence2_phrase_grounding": florence2_counts,
"florence2_phrase_grounding_image": florence2_counts,
"florence2_sam2_video_tracking": f2s2_counts,
}}
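Plan2 above loops the image tool over individual frames. With the video tool introduced by this commit, that step could also collapse into a single call — a hypothetical alternative, not part of this change (placeholder frames stand in for those extracted from 'video.mp4'):

```python
import numpy as np

from vision_agent.tools import florence2_phrase_grounding_video

# Placeholder frames standing in for the ones extracted at 1 FPS from the video.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(10)]

# One call instead of the per-frame list comprehension used in plan2.
florence2_out = florence2_phrase_grounding_video("person", frames)
florence2_counts = [len(dets) for dets in florence2_out]
```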
14 changes: 7 additions & 7 deletions vision_agent/agent/vision_agent_prompts.py
@@ -131,10 +131,10 @@
OBSERVATION:
[Artifact code.py]
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
1|def count_workers_with_helmets(image_path: str, output_path: str):
2| image = load_image(image_path)
3| detections = florence2_phrase_grounding("worker, helmet", image)
3| detections = florence2_phrase_grounding_image("worker, helmet", image)
4| workers = [d for d in detections if d['label'] == 'worker']
5| helmets = [d for d in detections if d['label'] == 'helmet']
6| count = 0
@@ -166,18 +166,18 @@
OBSERVATION:
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
OBSERVATION:
[Artifact code.py edits]
---
+++
@@ -1,7 +1,7 @@
from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
def count_workers_with_helmets(image_path: str, output_path: str):
image = load_image(image_path)
- detections = florence2_phrase_grounding("worker, helmet", image)
+ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
- detections = florence2_phrase_grounding_image("worker, helmet", image)
+ detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
workers = [d for d in detections if d['label'] == 'worker']
helmets = [d for d in detections if d['label'] == 'helmet']
count = 0
@@ -189,5 +189,5 @@ def count_workers_with_helmets(image_path: str, output_path: str):
----- stdout -----
3
AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
"""
3 changes: 2 additions & 1 deletion vision_agent/tools/__init__.py
@@ -24,7 +24,8 @@
extract_frames_and_timestamps,
florence2_image_caption,
florence2_ocr,
florence2_phrase_grounding,
florence2_phrase_grounding_image,
florence2_phrase_grounding_video,
florence2_roberta_vqa,
florence2_sam2_image,
florence2_sam2_video_tracking,
8 changes: 6 additions & 2 deletions vision_agent/tools/meta_tools.py
@@ -665,8 +665,12 @@ def use_object_detection_fine_tuning(

patterns_with_fine_tune_id = [
(
r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
),
(
r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
),
(
r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
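The new video pattern mirrors the existing image one. A standalone sketch of what the rewrite does (the regex and replacement are copied from the diff above; the code snippet and fine-tune id are invented for illustration):

```python
import re

fine_tune_id = "123"  # made-up id, mirroring the unit tests
pattern = (
    r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,'
    r'\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'
)
repl = lambda match: (
    f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")'
)

code = "florence2_phrase_grounding_video('person', frames)"
print(re.sub(pattern, repl, code))
# florence2_phrase_grounding_video("person", frames, "123")
```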
108 changes: 95 additions & 13 deletions vision_agent/tools/tools.py
@@ -1141,16 +1141,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
return answer[task] # type: ignore


# TODO: add video


def florence2_phrase_grounding(
def florence2_phrase_grounding_image(
prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
) -> List[Dict[str, Any]]:
"""'florence2_phrase_grounding' is a tool that can detect multiple
objects given a text prompt which can be object names or caption. You
can optionally separate the object names in the text with commas. It returns a list
of bounding boxes with normalized coordinates, label names and associated
"""'florence2_phrase_grounding_image' will run florence2 on a image. It can
detect multiple objects given a text prompt which can be object names or caption.
You can optionally separate the object names in the text with commas. It returns
a list of bounding boxes with normalized coordinates, label names and associated
probability scores of 1.0.
Parameters:
@@ -1168,7 +1165,7 @@ def florence2_phrase_grounding(
Example
-------
>>> florence2_phrase_grounding('person looking at a coyote', image)
>>> florence2_phrase_grounding_image('person looking at a coyote', image)
[
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -1196,7 +1193,7 @@ def florence2_phrase_grounding(
data,
"florence2-ft",
v2=True,
metadata_payload={"function_name": "florence2_phrase_grounding"},
metadata_payload={"function_name": "florence2_phrase_grounding_image"},
)
# get the first frame
detection = detections[0]
@@ -1205,7 +1202,7 @@ def florence2_phrase_grounding(
"image": image_b64,
"task": "<CAPTION_TO_PHRASE_GROUNDING>",
"prompt": prompt,
"function_name": "florence2_phrase_grounding",
"function_name": "florence2_phrase_grounding_image",
}
detections = send_inference_request(data, "florence2", v2=True)
detection = detections["<CAPTION_TO_PHRASE_GROUNDING>"]
@@ -1222,6 +1219,90 @@ def florence2_phrase_grounding(
return [bbox.model_dump() for bbox in return_data]


def florence2_phrase_grounding_video(
prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
) -> List[Dict[str, Any]]:
"""'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
It can detect multiple objects given a text prompt which can be object names or
caption. You can optionally separate the object names in the text with commas.
It returns a list of lists where each inner list contains bounding boxes with
normalized coordinates, label names and associated probability scores of 1.0.
Parameters:
prompt (str): The prompt to ground to the video.
frames (List[np.ndarray]): The list of frames to detect objects.
fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
fine-tuned model ID here to use it.
Returns:
List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
label, and bounding box of the detected objects with normalized coordinates
between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
of the top-left and xmax and ymax are the coordinates of the bottom-right of
the bounding box. The scores are always 1.0 and cannot be thresholded.
Example
-------
>>> florence2_phrase_grounding_video('person looking at a coyote', frames)
[
[
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
],
...
]
"""
if len(frames) == 0:
raise ValueError("No frames provided")

image_size = frames[0].shape[:2]
buffer_bytes = frames_to_bytes(frames)
files = [("video", buffer_bytes)]

if fine_tune_id is not None:
landing_api = LandingPublicAPI()
status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
if status is not JobStatus.SUCCEEDED:
raise FineTuneModelIsNotReady(
f"Fine-tuned model {fine_tune_id} is not ready yet"
)

data_obj = Florence2FtRequest(
video=buffer_bytes,
task=PromptTask.PHRASE_GROUNDING,
prompt=prompt,
job_id=UUID(fine_tune_id),
)
data = data_obj.model_dump(by_alias=True, exclude_none=True)
else:
data_obj = Florence2FtRequest(
video=buffer_bytes, task=PromptTask.PHRASE_GROUNDING, prompt=prompt
)
data = data_obj.model_dump(by_alias=True, exclude_none=True)

detections = send_inference_request(
data,
"florence2-ft",
v2=True,
files=files,
metadata_payload={"function_name": "florence2_phrase_grounding_video"},
)

bboxes_formatted = []
for frame_data in detections:
bboxes_formatted_per_frame = []
for idx in range(len(frame_data["bboxes"])):
bboxes_formatted_per_frame.append(
ODResponseData(
label=frame_data["labels"][idx],
bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
score=1.0,
)
)
bboxes_formatted.append(bboxes_formatted_per_frame)
return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]


def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
"""'florence2_ocr' is a tool that can detect text and text regions in an image.
Each text region contains one line of text. It returns a list of detected text,
@@ -1233,7 +1314,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
Returns:
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
with nornmalized coordinates, and confidence score.
with normalized coordinates, and confidence score.
Example
-------
@@ -2077,7 +2158,8 @@ def overlay_counting_results(
florence2_ocr,
florence2_sam2_image,
florence2_sam2_video_tracking,
florence2_phrase_grounding,
florence2_phrase_grounding_image,
florence2_phrase_grounding_video,
ixc25_image_vqa,
ixc25_video_vqa,
detr_segmentation,
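For reference, a short end-to-end sketch of the new video tool's nested return value, mirroring the integration test added above (running it calls the hosted florence2 endpoint):

```python
import numpy as np
import skimage as ski
from PIL import Image

from vision_agent.tools import florence2_phrase_grounding_video

# Build a 10-frame RGB clip the same way the new integration test does.
frames = [
    np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
]

detections = florence2_phrase_grounding_video("coin", frames)

# One inner list per frame; each entry carries 'label', 'bbox', and a fixed 1.0 'score'.
assert len(detections) == len(frames)
print([len(frame_dets) for frame_dets in detections])
```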
