From 46343e0ff956546215abdd9d309dbbb296b9d075 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 27 Aug 2024 10:52:20 -0700 Subject: [PATCH] fixed florence OD as phrase grounding --- tests/integ/test_tools.py | 4 ++-- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/tools.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index afa9dcb4..bca1f6ea 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -10,7 +10,7 @@ detr_segmentation, dpt_hybrid_midas, florence2_image_caption, - florence2_object_detection, + florence2_phrase_grounding, florence2_ocr, florence2_roberta_vqa, florence2_sam2_image, @@ -65,7 +65,7 @@ def test_owl(): def test_object_detection(): img = ski.data.coins() - result = florence2_object_detection( + result = florence2_phrase_grounding( image=img, prompt="coin", ) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index a90b7181..3372fcbb 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -21,7 +21,7 @@ dpt_hybrid_midas, extract_frames, florence2_image_caption, - florence2_object_detection, + florence2_phrase_grounding, florence2_ocr, florence2_roberta_vqa, florence2_sam2_image, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 594fcf6d..250d6d78 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -760,10 +760,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s return answer[task] # type: ignore -def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: - """'florencev2_object_detection' is a tool that can detect and count multiple - objects given a text prompt such as category names or referring expressions. You - can optionally separate the categories in the text with commas. 
It returns a list +def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: + """'florence2_phrase_grounding' is a tool that can detect multiple + objects given a text prompt which can be object names or a caption. You + can optionally separate the object names in the text with commas. It returns a list of bounding boxes with normalized coordinates, label names and associated probability scores of 1.0. @@ -780,7 +780,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Example ------- - >>> florence2_object_detection('person looking at a coyote', image) + >>> florence2_phrase_grounding('person looking at a coyote', image) [ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, @@ -792,7 +792,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, "image": image_b64, "task": "", "prompt": prompt, - "function_name": "florence2_object_detection", + "function_name": "florence2_phrase_grounding", } detections = send_inference_request(data, "florence2", v2=True) @@ -1663,7 +1663,7 @@ def florencev2_fine_tuned_object_detection( florence2_ocr, florence2_sam2_image, florence2_sam2_video, - florence2_object_detection, + florence2_phrase_grounding, ixc25_image_vqa, ixc25_video_vqa, detr_segmentation,