From dca03f717fa953d26fd6a63d19636317f6f23879 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Fri, 9 Aug 2024 11:23:07 -0700 Subject: [PATCH] updated florence OD to phrase_grounding in the backend which requires a text prompt --- tests/integ/test_tools.py | 5 +++-- vision_agent/tools/tools.py | 21 ++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index 57d536bd..5a1cde74 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -61,9 +61,10 @@ def test_object_detection(): img = ski.data.coins() result = florencev2_object_detection( image=img, + prompt="coin", ) - assert len(result) == 24 - assert [res["label"] for res in result] == ["coin"] * 24 + assert len(result) == 25 + assert [res["label"] for res in result] == ["coin"] * 25 def test_template_match(): diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index a78457ed..71901c66 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -614,13 +614,17 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> return answer["text"][0] # type: ignore -def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]: - """'florencev2_object_detection' is a tool that can detect common objects in an - image without any text prompt or thresholding. It returns a list of detected objects - as labels and their location as bounding boxes. +def florencev2_object_detection( + image: np.ndarray, + prompt: str, +) -> List[Dict[str, Any]]: + """'florencev2_object_detection' is a tool that can detect objects given a text + prompt such as a phrase or class names separated by commas. It returns a list of + detected objects as labels and their location as bounding boxes with score of 1.0. 
Parameters: image (np.ndarray): The image to used to detect objects + prompt (str): Phrase or classes to detect objects in the image Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, and @@ -631,17 +635,17 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]: Example ------- - >>> florencev2_object_detection(image) + >>> florencev2_object_detection(image, 'person looking out at coyote') [ - {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]}, - {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5}, - {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5}, + {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, + {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]}, ] """ image_size = image.shape[:2] image_b64 = convert_to_b64(image) data = { "image": image_b64, + "prompt": prompt, "tool": "object_detection", "function_name": "florencev2_object_detection", } @@ -1253,7 +1257,6 @@ def overlay_heat_map( loca_visual_prompt_counting, florencev2_roberta_vqa, florencev2_image_caption, - florencev2_object_detection, detr_segmentation, depth_anything_v2, generate_soft_edge_image,