From 46343e0ff956546215abdd9d309dbbb296b9d075 Mon Sep 17 00:00:00 2001
From: shankar-landing-ai <shankar.anand@landing.ai>
Date: Tue, 27 Aug 2024 10:52:20 -0700
Subject: [PATCH] Rename florence2_object_detection to florence2_phrase_grounding

---
 tests/integ/test_tools.py      |  4 ++--
 vision_agent/tools/__init__.py |  2 +-
 vision_agent/tools/tools.py    | 14 +++++++-------
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index afa9dcb4..bca1f6ea 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -10,7 +10,7 @@
     detr_segmentation,
     dpt_hybrid_midas,
     florence2_image_caption,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     florence2_ocr,
     florence2_roberta_vqa,
     florence2_sam2_image,
@@ -65,7 +65,7 @@ def test_owl():
 
 def test_object_detection():
     img = ski.data.coins()
-    result = florence2_object_detection(
+    result = florence2_phrase_grounding(
         image=img,
         prompt="coin",
     )
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index a90b7181..3372fcbb 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -21,7 +21,7 @@
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     florence2_ocr,
     florence2_roberta_vqa,
     florence2_sam2_image,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 594fcf6d..250d6d78 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -760,10 +760,10 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect and count multiple
-    objects given a text prompt such as category names or referring expressions. You
-    can optionally separate the categories in the text with commas. It returns a list
+def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+    """'florence2_phrase_grounding' is a tool that can detect multiple
+    objects given a text prompt which can be object names or a caption. You
+    can optionally separate the object names in the text with commas. It returns a list
     of bounding boxes with normalized coordinates, label names and associated
     probability scores of 1.0.
 
@@ -780,7 +780,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
 
     Example
     -------
-        >>> florence2_object_detection('person looking at a coyote', image)
+        >>> florence2_phrase_grounding('person looking at a coyote', image)
         [
             {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
             {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
@@ -792,7 +792,7 @@ def florence2_object_detection(prompt: str, image: np.ndarray) -> List[Dict[str,
         "image": image_b64,
         "task": "<CAPTION_TO_PHRASE_GROUNDING>",
         "prompt": prompt,
-        "function_name": "florence2_object_detection",
+        "function_name": "florence2_phrase_grounding",
     }
 
     detections = send_inference_request(data, "florence2", v2=True)
@@ -1663,7 +1663,7 @@ def florencev2_fine_tuned_object_detection(
     florence2_ocr,
     florence2_sam2_image,
     florence2_sam2_video,
-    florence2_object_detection,
+    florence2_phrase_grounding,
     ixc25_image_vqa,
     ixc25_video_vqa,
     detr_segmentation,