diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml
index 3576e10c..ce25f286 100644
--- a/.github/workflows/ci_cd.yml
+++ b/.github/workflows/ci_cd.yml
@@ -83,9 +83,6 @@ jobs:
- name: Test with pytest
run: |
poetry run pytest -v tests/integ
- - name: Test with pytest, dev env
- run: |
- LANDINGAI_API_KEY=$LANDINGAI_DEV_API_KEY LANDINGAI_URL=https://api.dev.landing.ai poetry run pytest -v tests/integration_dev
release:
name: Release
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 4954738c..796fcdce 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -11,7 +11,8 @@
dpt_hybrid_midas,
florence2_image_caption,
florence2_ocr,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
+ # florence2_phrase_grounding_video,
florence2_roberta_vqa,
florence2_sam2_image,
florence2_sam2_video_tracking,
@@ -31,6 +32,8 @@
template_match,
vit_image_classification,
vit_nsfw_classification,
+ countgd_counting,
+ countgd_example_based_counting,
)
FINE_TUNE_ID = "65ebba4a-88b7-419f-9046-0750e30250da"
@@ -92,9 +95,9 @@ def test_owl_v2_video():
assert 24 <= len([res["label"] for res in result[0]]) <= 26
-def test_florence2_phrase_grounding():
+def test_florence2_phrase_grounding_image():
img = ski.data.coins()
- result = florence2_phrase_grounding(
+ result = florence2_phrase_grounding_image(
image=img,
prompt="coin",
)
@@ -102,9 +105,9 @@ def test_florence2_phrase_grounding():
assert [res["label"] for res in result] == ["coin"] * 25
-def test_florence2_phrase_grounding_fine_tune_id():
+def test_florence2_phrase_grounding_image_fine_tune_id():
img = ski.data.coins()
- result = florence2_phrase_grounding(
+ result = florence2_phrase_grounding_image(
prompt="coin",
image=img,
fine_tune_id=FINE_TUNE_ID,
@@ -114,6 +117,32 @@ def test_florence2_phrase_grounding_fine_tune_id():
assert [res["label"] for res in result] == ["coin"] * len(result)
+# def test_florence2_phrase_grounding_video():
+# frames = [
+# np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+# ]
+# result = florence2_phrase_grounding_video(
+# prompt="coin",
+# frames=frames,
+# )
+# assert len(result) == 10
+# assert 2 <= len([res["label"] for res in result[0]]) <= 26
+
+
+# def test_florence2_phrase_grounding_video_fine_tune_id():
+# frames = [
+# np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
+# ]
+# # this calls a fine-tuned florence2 model which is going to be worse at this task
+# result = florence2_phrase_grounding_video(
+# prompt="coin",
+# frames=frames,
+# fine_tune_id=FINE_TUNE_ID,
+# )
+# assert len(result) == 10
+# assert 16 <= len([res["label"] for res in result[0]]) <= 26
+
+
def test_template_match():
img = ski.data.coins()
result = template_match(
@@ -360,3 +389,18 @@ def test_generate_hed():
)
assert result.shape == img.shape
+
+
+def test_countgd_counting() -> None:
+ img = ski.data.coins()
+ result = countgd_counting(image=img, prompt="coin")
+ assert len(result) == 24
+
+
+def test_countgd_example_based_counting() -> None:
+ img = ski.data.coins()
+ result = countgd_example_based_counting(
+ visual_prompts=[[85, 106, 122, 145]],
+ image=img,
+ )
+ assert len(result) == 24
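
Note: a minimal usage sketch for the promoted countgd tools, mirroring the tests above. It assumes a configured LandingAI API key, and the visual prompt appears to be a pixel-space box [xmin, ymin, xmax, ymax] drawn around one example object.

```python
# Usage sketch (assumes API access); values mirror the integration tests above.
import skimage as ski

from vision_agent.tools import countgd_counting, countgd_example_based_counting

img = ski.data.coins()

# Text-prompted counting.
by_text = countgd_counting(image=img, prompt="coin")

# Example-based counting: one box around a single coin as the visual prompt.
by_example = countgd_example_based_counting(
    visual_prompts=[[85, 106, 122, 145]],
    image=img,
)

print(len(by_text), len(by_example))  # both expected to be 24 on this image
```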
diff --git a/tests/integration_dev/__init__.py b/tests/integration_dev/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tests/integration_dev/test_tools.py b/tests/integration_dev/test_tools.py
deleted file mode 100644
index 246c5642..00000000
--- a/tests/integration_dev/test_tools.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import skimage as ski
-
-from vision_agent.tools import countgd_counting, countgd_example_based_counting
-
-
-def test_countgd_counting() -> None:
- img = ski.data.coins()
- result = countgd_counting(image=img, prompt="coin")
- assert len(result) == 24
-
-
-def test_countgd_example_based_counting() -> None:
- img = ski.data.coins()
- result = countgd_example_based_counting(
- visual_prompts=[[85, 106, 122, 145]],
- image=img,
- )
- assert len(result) == 24
diff --git a/tests/unit/test_meta_tools.py b/tests/unit/test_meta_tools.py
index fced644b..ef07bb9e 100644
--- a/tests/unit/test_meta_tools.py
+++ b/tests/unit/test_meta_tools.py
@@ -33,16 +33,16 @@ def test_use_object_detection_fine_tuning_none():
def test_use_object_detection_fine_tuning():
artifacts = Artifacts("test")
- code = """florence2_phrase_grounding('one', image1)
+ code = """florence2_phrase_grounding_image('one', image1)
owl_v2_image('two', image2)
florence2_sam2_image('three', image3)"""
- expected_code = """florence2_phrase_grounding("one", image1, "123")
+ expected_code = """florence2_phrase_grounding_image("one", image1, "123")
owl_v2_image("two", image2, "123")
florence2_sam2_image("three", image3, "123")"""
artifacts["code"] = code
output = use_object_detection_fine_tuning(artifacts, "code", "123")
- assert 'florence2_phrase_grounding("one", image1, "123")' in output
+ assert 'florence2_phrase_grounding_image("one", image1, "123")' in output
assert 'owl_v2_image("two", image2, "123")' in output
assert 'florence2_sam2_image("three", image3, "123")' in output
assert artifacts["code"] == expected_code
@@ -50,24 +50,24 @@ def test_use_object_detection_fine_tuning():
def test_use_object_detection_fine_tuning_twice():
artifacts = Artifacts("test")
- code = """florence2_phrase_grounding('one', image1)
+ code = """florence2_phrase_grounding_image('one', image1)
owl_v2_image('two', image2)
florence2_sam2_image('three', image3)"""
- expected_code1 = """florence2_phrase_grounding("one", image1, "123")
+ expected_code1 = """florence2_phrase_grounding_image("one", image1, "123")
owl_v2_image("two", image2, "123")
florence2_sam2_image("three", image3, "123")"""
- expected_code2 = """florence2_phrase_grounding("one", image1, "456")
+ expected_code2 = """florence2_phrase_grounding_image("one", image1, "456")
owl_v2_image("two", image2, "456")
florence2_sam2_image("three", image3, "456")"""
artifacts["code"] = code
output = use_object_detection_fine_tuning(artifacts, "code", "123")
- assert 'florence2_phrase_grounding("one", image1, "123")' in output
+ assert 'florence2_phrase_grounding_image("one", image1, "123")' in output
assert 'owl_v2_image("two", image2, "123")' in output
assert 'florence2_sam2_image("three", image3, "123")' in output
assert artifacts["code"] == expected_code1
output = use_object_detection_fine_tuning(artifacts, "code", "456")
- assert 'florence2_phrase_grounding("one", image1, "456")' in output
+ assert 'florence2_phrase_grounding_image("one", image1, "456")' in output
assert 'owl_v2_image("two", image2, "456")' in output
assert 'florence2_sam2_image("three", image3, "456")' in output
assert artifacts["code"] == expected_code2
diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py
index 07f2c6e2..45fc02ed 100644
--- a/vision_agent/agent/vision_agent_coder_prompts.py
+++ b/vision_agent/agent/vision_agent_coder_prompts.py
@@ -101,7 +101,7 @@
- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
plan2:
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
-- Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_phrase_grounding_image' tool with the prompt 'person' to detect where the people are in the video.
plan3:
- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
@@ -109,7 +109,7 @@
```python
import numpy as np
-from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding_image, florence2_sam2_video_tracking
# sample at 1 FPS and use the first 10 frames to reduce processing time
frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -143,7 +143,7 @@ def get_counts(preds):
owl_v2_counts = get_counts(owl_v2_out)
# plan2
-florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
+florence2_out = [florence2_phrase_grounding_image("person", f) for f in frames]
florence2_counts = get_counts(florence2_out)
# plan3
@@ -153,13 +153,13 @@ def get_counts(preds):
final_out = {{
"owl_v2_video": owl_v2_out,
- "florence2_phrase_grounding": florence2_out,
+ "florence2_phrase_grounding_image": florence2_out,
"florence2_sam2_video_tracking": f2s2_out,
}}
counts = {{
"owl_v2_video": owl_v2_counts,
- "florence2_phrase_grounding": florence2_counts,
+ "florence2_phrase_grounding_image": florence2_counts,
"florence2_sam2_video_tracking": f2s2_counts,
}}
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index bc3295ef..4a668bda 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -131,10 +131,10 @@
OBSERVATION:
[Artifact code.py]
-0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
1|def count_workers_with_helmets(image_path: str, output_path: str):
2| image = load_image(image_path)
-3| detections = florence2_phrase_grounding("worker, helmet", image)
+3| detections = florence2_phrase_grounding_image("worker, helmet", image)
4| workers = [d for d in detections if d['label'] == 'worker']
5| helmets = [d for d in detections if d['label'] == 'helmet']
6| count = 0
@@ -166,18 +166,18 @@
OBSERVATION:
[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding_image call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false}
OBSERVATION:
[Artifact code.py edits]
---
+++
@@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_phrase_grounding_image, closest_box_distance, overlay_bounding_boxes, save_image
def count_workers_with_helmets(image_path: str, output_path: str):
image = load_image(image_path)
-- detections = florence2_phrase_grounding("worker, helmet", image)
-+ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+- detections = florence2_phrase_grounding_image("worker, helmet", image)
++ detections = florence2_phrase_grounding_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
workers = [d for d in detections if d['label'] == 'worker']
helmets = [d for d in detections if d['label'] == 'helmet']
count = 0
@@ -189,5 +189,5 @@ def count_workers_with_helmets(image_path: str, output_path: str):
----- stdout -----
3
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
"""
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 22453224..2ed88789 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -24,7 +24,7 @@
extract_frames_and_timestamps,
florence2_image_caption,
florence2_ocr,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
florence2_roberta_vqa,
florence2_sam2_image,
florence2_sam2_video_tracking,
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7d70e031..597bf5cc 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -665,8 +665,12 @@ def use_object_detection_fine_tuning(
patterns_with_fine_tune_id = [
(
- r'florence2_phrase_grounding\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
- lambda match: f'florence2_phrase_grounding("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+ r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+ lambda match: f'florence2_phrase_grounding_image("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
+ ),
+ (
+ r'florence2_phrase_grounding_video\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
+ lambda match: f'florence2_phrase_grounding_video("{match.group(1)}", {match.group(2)}, "{fine_tune_id}")',
),
(
r'owl_v2_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)',
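
Note: a standalone sketch of the rewrite these patterns perform, using the image regex from the hunk above verbatim; the `code` string and fine-tune ids are illustrative values taken from the unit tests.

```python
import re

# Pattern copied from the diff above; group 1 is the prompt, group 2 the image
# argument, and the optional non-capturing group swallows any existing id.
PATTERN = r'florence2_phrase_grounding_image\(\s*["\']([^"\']+)["\']\s*,\s*([^,]+)(?:,\s*["\'][^"\']+["\'])?\s*\)'


def rewrite(code: str, fine_tune_id: str) -> str:
    return re.sub(
        PATTERN,
        lambda m: f'florence2_phrase_grounding_image("{m.group(1)}", {m.group(2)}, "{fine_tune_id}")',
        code,
    )


code = "florence2_phrase_grounding_image('one', image1)"
once = rewrite(code, "123")
print(once)  # florence2_phrase_grounding_image("one", image1, "123")

# Applying it again replaces the old id instead of appending a second one.
print(rewrite(once, "456"))  # florence2_phrase_grounding_image("one", image1, "456")
```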
diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
index 605f5511..924b96e6 100644
--- a/vision_agent/tools/tool_utils.py
+++ b/vision_agent/tools/tool_utils.py
@@ -1,6 +1,6 @@
+import os
import inspect
import logging
-import os
from base64 import b64encode
from typing import Any, Callable, Dict, List, MutableMapping, Optional, Tuple
@@ -37,8 +37,9 @@ def send_inference_request(
files: Optional[List[Tuple[Any, ...]]] = None,
v2: bool = False,
metadata_payload: Optional[Dict[str, Any]] = None,
+ is_form: bool = False,
) -> Any:
- # TODO: runtime_tag and function_name should be metadata_payload and now included
+ # TODO: runtime_tag and function_name should be in metadata_payload and not included
# in the service payload
if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
payload["runtime_tag"] = runtime_tag
@@ -64,7 +65,7 @@ def send_inference_request(
elif metadata_payload is not None and "function_name" in metadata_payload:
function_name = metadata_payload["function_name"]
- response = _call_post(url, payload, session, files, function_name)
+ response = _call_post(url, payload, session, files, function_name, is_form)
# TODO: consider making the response schema the same between below two sources
return response if "TOOL_ENDPOINT_AUTH" in os.environ else response["data"]
@@ -75,6 +76,7 @@ def send_task_inference_request(
task_name: str,
files: Optional[List[Tuple[Any, ...]]] = None,
metadata: Optional[Dict[str, Any]] = None,
+ is_form: bool = False,
) -> Any:
url = f"{_LND_API_URL_v2}/{task_name}"
headers = {"apikey": _LND_API_KEY}
@@ -87,7 +89,7 @@ def send_task_inference_request(
function_name = "unknown"
if metadata is not None and "function_name" in metadata:
function_name = metadata["function_name"]
- response = _call_post(url, payload, session, files, function_name)
+ response = _call_post(url, payload, session, files, function_name, is_form)
return response["data"]
@@ -203,6 +205,7 @@ def _call_post(
session: Session,
files: Optional[List[Tuple[Any, ...]]] = None,
function_name: str = "unknown",
+ is_form: bool = False,
) -> Any:
files_in_b64 = None
if files:
@@ -210,6 +213,8 @@ def _call_post(
try:
if files is not None:
response = session.post(url, data=payload, files=files)
+ elif is_form:
+ response = session.post(url, data=payload)
else:
response = session.post(url, json=payload)
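
Note: the new `is_form` flag switches the request body from JSON to form encoding. A minimal sketch of the difference using `requests` (placeholder URL; the request is only prepared, never sent):

```python
import requests

payload = {"prompt": "coin", "task": ""}
url = "https://example.com/inference"  # placeholder, not a real endpoint

# data= form-encodes the payload, as the is_form branch does.
form_req = requests.Request("POST", url, data=payload).prepare()
# json= serializes the payload as a JSON body, the default branch.
json_req = requests.Request("POST", url, json=payload).prepare()

print(form_req.headers["Content-Type"])  # application/x-www-form-urlencoded
print(form_req.body)                     # prompt=coin&task=
print(json_req.headers["Content-Type"])  # application/json
```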
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 95fdd56c..67f78307 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1,3 +1,4 @@
+import base64
import io
import json
import logging
@@ -28,7 +29,6 @@
send_task_inference_request,
)
from vision_agent.tools.tools_types import (
- FineTuning,
Florence2FtRequest,
JobStatus,
ODResponseData,
@@ -194,20 +194,26 @@ def owl_v2_image(
data_obj = Florence2FtRequest(
image=image_b64,
task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
prompt=prompt,
- fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ job_id=UUID(fine_tune_id),
)
- data = data_obj.model_dump(by_alias=True)
- detections = send_inference_request(data, "tools", v2=False)
- detections = detections[""]
+ data = data_obj.model_dump(by_alias=True, exclude_none=True)
+ detections = send_inference_request(
+ data,
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "owl_v2_image"},
+ )
+ # get the first frame
+ detection = detections[0]
bboxes_formatted = [
ODResponseData(
- label=detections["labels"][i],
- bbox=normalize_bbox(detections["bboxes"][i], image_size),
+ label=detection["labels"][i],
+ bbox=normalize_bbox(detection["bboxes"][i], image_size),
score=1.0,
)
- for i in range(len(detections["bboxes"]))
+ for i in range(len(detection["bboxes"]))
]
return [bbox.model_dump() for bbox in bboxes_formatted]
@@ -419,25 +425,30 @@ def florence2_sam2_image(
req_data_obj = Florence2FtRequest(
image=image_b64,
task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
prompt=prompt,
- fine_tuning=FineTuning(
- job_id=UUID(fine_tune_id),
- postprocessing="sam2",
- ),
+ postprocessing="sam2",
+ job_id=UUID(fine_tune_id),
+ )
+ req_data = req_data_obj.model_dump(by_alias=True, exclude_none=True)
+ detections_ft = send_inference_request(
+ req_data,
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "florence2_sam2_image"},
)
- req_data = req_data_obj.model_dump(by_alias=True)
- detections_ft = send_inference_request(req_data, "tools", v2=False)
- detections_ft = detections_ft[""]
+ # get the first frame
+ detection = detections_ft[0]
return_data = []
- all_masks = np.array(detections_ft["masks"])
- for i in range(len(detections_ft["bboxes"])):
+ for i in range(len(detection["bboxes"])):
return_data.append(
{
"score": 1.0,
- "label": detections_ft["labels"][i],
- "bbox": detections_ft["bboxes"][i],
- "mask": all_masks[i, :, :].astype(np.uint8),
+ "label": detection["labels"][i],
+ "bbox": normalize_bbox(
+ detection["bboxes"][i], detection["masks"][i]["size"]
+ ),
+ "mask": rle_decode_array(detection["masks"][i]),
}
)
return return_data
@@ -451,6 +462,7 @@ def florence2_sam2_image(
detections: Dict[str, Any] = send_inference_request(
payload, "florence2-sam2", files=files, v2=True
)
+
return_data = []
for _, data_i in detections["0"].items():
mask = rle_decode_array(data_i["mask"])
@@ -688,22 +700,18 @@ def countgd_counting(
{'score': 0.98, 'label': 'flower', 'bbox': [0.44, 0.24, 0.49, 0.58]},
]
"""
- buffer_bytes = numpy_to_bytes(image)
- files = [("image", buffer_bytes)]
+ image_b64 = convert_to_b64(image)
prompt = prompt.replace(", ", " .")
- payload = {"prompts": [prompt], "model": "countgd"}
+ payload = {"prompt": prompt, "image": image_b64}
metadata = {"function_name": "countgd_counting"}
- resp_data = send_task_inference_request(
- payload, "text-to-object-detection", files=files, metadata=metadata
- )
- bboxes_per_frame = resp_data[0]
+ resp_data = send_task_inference_request(payload, "countgd", metadata=metadata)
bboxes_formatted = [
ODResponseData(
label=bbox["label"],
- bbox=list(map(lambda x: round(x, 2), bbox["bounding_box"])),
+ bbox=list(map(lambda x: round(x, 2), bbox["bbox"])),
score=round(bbox["score"], 2),
)
- for bbox in bboxes_per_frame
+ for bbox in resp_data
]
filtered_bboxes = filter_bboxes_by_threshold(bboxes_formatted, box_threshold)
return [bbox.model_dump() for bbox in filtered_bboxes]
@@ -887,7 +895,10 @@ def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[b
"function_name": "ixc25_temporal_localization",
}
data: List[int] = send_inference_request(
- payload, "video-temporal-localization", files=files, v2=True
+ payload,
+ "video-temporal-localization?model=internlm-xcomposer",
+ files=files,
+ v2=True,
)
chunk_size = round(len(frames) / len(data))
data_explode = [[elt] * chunk_size for elt in data]
@@ -1132,13 +1143,13 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
return answer[task] # type: ignore
-def florence2_phrase_grounding(
+def florence2_phrase_grounding_image(
prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
) -> List[Dict[str, Any]]:
- """'florence2_phrase_grounding' is a tool that can detect multiple
- objects given a text prompt which can be object names or caption. You
- can optionally separate the object names in the text with commas. It returns a list
- of bounding boxes with normalized coordinates, label names and associated
+ """'florence2_phrase_grounding_image' will run florence2 on an image. It can
+ detect multiple objects given a text prompt, which can be object names or a caption.
+ You can optionally separate the object names in the text with commas. It returns
+ a list of bounding boxes with normalized coordinates, label names and associated
probability scores of 1.0.
Parameters:
@@ -1156,7 +1167,7 @@ def florence2_phrase_grounding(
Example
-------
- >>> florence2_phrase_grounding('person looking at a coyote', image)
+ >>> florence2_phrase_grounding_image('person looking at a coyote', image)
[
{'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
{'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
@@ -1176,39 +1187,128 @@ def florence2_phrase_grounding(
data_obj = Florence2FtRequest(
image=image_b64,
task=PromptTask.PHRASE_GROUNDING,
- tool="florencev2_fine_tuning",
prompt=prompt,
- fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ job_id=UUID(fine_tune_id),
)
- data = data_obj.model_dump(by_alias=True)
+ data = data_obj.model_dump(by_alias=True, exclude_none=True)
detections = send_inference_request(
data,
- "tools",
- v2=False,
- metadata_payload={"function_name": "florence2_phrase_grounding"},
+ "florence2-ft",
+ v2=True,
+ is_form=True,
+ metadata_payload={"function_name": "florence2_phrase_grounding_image"},
)
+ # get the first frame
+ detection = detections[0]
else:
data = {
"image": image_b64,
"task": "",
"prompt": prompt,
- "function_name": "florence2_phrase_grounding",
+ "function_name": "florence2_phrase_grounding_image",
}
detections = send_inference_request(data, "florence2", v2=True)
+ detection = detections[""]
- detections = detections[""]
return_data = []
- for i in range(len(detections["bboxes"])):
+ for i in range(len(detection["bboxes"])):
return_data.append(
ODResponseData(
- label=detections["labels"][i],
- bbox=normalize_bbox(detections["bboxes"][i], image_size),
+ label=detection["labels"][i],
+ bbox=normalize_bbox(detection["bboxes"][i], image_size),
score=1.0,
)
)
return [bbox.model_dump() for bbox in return_data]
+def florence2_phrase_grounding_video(
+ prompt: str, frames: List[np.ndarray], fine_tune_id: Optional[str] = None
+) -> List[List[Dict[str, Any]]]:
+ """'florence2_phrase_grounding_video' will run florence2 on each frame of a video.
+ It can detect multiple objects given a text prompt, which can be object names
+ or a caption. You can optionally separate the object names in the text with commas.
+ It returns a list of lists where each inner list contains bounding boxes with
+ normalized coordinates, label names and associated probability scores of 1.0.
+
+ Parameters:
+ prompt (str): The prompt to ground to the video.
+ frames (List[np.ndarray]): The list of frames to detect objects.
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.
+
+ Returns:
+ List[List[Dict[str, Any]]]: A list of lists of dictionaries containing the score,
+ label, and bounding box of the detected objects with normalized coordinates
+ between 0 and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates
+ of the top-left and xmax and ymax are the coordinates of the bottom-right of
+ the bounding box. The scores are always 1.0 and cannot be thresholded.
+
+ Example
+ -------
+ >>> florence2_phrase_grounding_video('person looking at a coyote', frames)
+ [
+ [
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+ ],
+ ...
+ ]
+ """
+ if len(frames) == 0:
+ raise ValueError("No frames provided")
+
+ image_size = frames[0].shape[:2]
+ buffer_bytes = frames_to_bytes(frames)
+ files = [("video", buffer_bytes)]
+
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ task=PromptTask.PHRASE_GROUNDING,
+ prompt=prompt,
+ job_id=UUID(fine_tune_id),
+ )
+
+ data = data_obj.model_dump(by_alias=True, exclude_none=True, mode="json")
+ detections = send_inference_request(
+ data,
+ "florence2-ft",
+ v2=True,
+ files=files,
+ metadata_payload={"function_name": "florence2_phrase_grounding_video"},
+ )
+ else:
+ data = {
+ "prompt": prompt,
+ "task": "",
+ "function_name": "florence2_phrase_grounding_video",
+ "video": base64.b64encode(buffer_bytes).decode("utf-8"),
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+ detections = [d[""] for d in detections]
+
+ bboxes_formatted = []
+ for frame_data in detections:
+ bboxes_formatted_per_frame = []
+ for idx in range(len(frame_data["bboxes"])):
+ bboxes_formatted_per_frame.append(
+ ODResponseData(
+ label=frame_data["labels"][idx],
+ bbox=normalize_bbox(frame_data["bboxes"][idx], image_size),
+ score=1.0,
+ )
+ )
+ bboxes_formatted.append(bboxes_formatted_per_frame)
+ return [[bbox.model_dump() for bbox in frame] for frame in bboxes_formatted]
+
+
def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
"""'florence2_ocr' is a tool that can detect text and text regions in an image.
Each text region contains one line of text. It returns a list of detected text,
@@ -1220,7 +1320,7 @@ def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
Returns:
List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
- with nornmalized coordinates, and confidence score.
+ with normalized coordinates, and confidence score.
Example
-------
@@ -1603,7 +1703,7 @@ def extract_frames_and_timestamps(
"""
def reformat(
- frames_and_timestamps: List[Tuple[np.ndarray, float]]
+ frames_and_timestamps: List[Tuple[np.ndarray, float]],
) -> List[Dict[str, Union[np.ndarray, float]]]:
return [
{"frame": frame, "timestamp": timestamp}
@@ -2064,7 +2164,8 @@ def overlay_counting_results(
florence2_ocr,
florence2_sam2_image,
florence2_sam2_video_tracking,
- florence2_phrase_grounding,
+ florence2_phrase_grounding_image,
+ florence2_phrase_grounding_video,
ixc25_image_vqa,
ixc25_video_vqa,
detr_segmentation,
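
Note: a usage sketch for the new `florence2_phrase_grounding_video`, mirroring the commented-out integration test above; it assumes a configured API key and imports the function from its defining module in case it is not re-exported from the package root.

```python
import numpy as np
import skimage as ski
from PIL import Image

from vision_agent.tools.tools import florence2_phrase_grounding_video

# Ten identical RGB frames built from the skimage coins sample.
frames = [
    np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
]
results = florence2_phrase_grounding_video(prompt="coin", frames=frames)

# One list of detections per frame; scores are always 1.0 per the docstring.
assert len(results) == len(frames)
for dets in results[:1]:
    for det in dets:
        print(det["label"], det["score"], det["bbox"])
```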
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index aa0e430f..1cc765b6 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -1,6 +1,6 @@
from enum import Enum
-from typing import List, Optional, Tuple, Union
from uuid import UUID
+from typing import List, Optional, Tuple, Union
from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
@@ -24,27 +24,22 @@ class PromptTask(str, Enum):
PHRASE_GROUNDING = ""
-class FineTuning(BaseModel):
+class Florence2FtRequest(BaseModel):
model_config = ConfigDict(populate_by_name=True)
- job_id: UUID = Field(alias="jobId")
+ image: Optional[str] = None
+ video: Optional[bytes] = None
+ task: PromptTask
+ prompt: Optional[str] = ""
+ chunk_length_frames: Optional[int] = None
postprocessing: Optional[str] = None
+ job_id: Optional[UUID] = Field(None, alias="jobId")
@field_serializer("job_id")
def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
return str(job_id)
-class Florence2FtRequest(BaseModel):
- model_config = ConfigDict(populate_by_name=True)
-
- image: str
- task: PromptTask
- tool: str
- prompt: Optional[str] = ""
- fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")
-
-
class JobStatus(str, Enum):
"""The status of a fine-tuning job.