From dca03f717fa953d26fd6a63d19636317f6f23879 Mon Sep 17 00:00:00 2001
From: shankar-landing-ai <shankar.anand@landing.ai>
Date: Fri, 9 Aug 2024 11:23:07 -0700
Subject: [PATCH] updated florence OD to phrase_grounding in the backend which
 requires a text prompt

---
 tests/integ/test_tools.py   |  5 +++--
 vision_agent/tools/tools.py | 21 ++++++++++++---------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 57d536bd..5a1cde74 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -61,9 +61,10 @@ def test_object_detection():
     img = ski.data.coins()
     result = florencev2_object_detection(
         image=img,
+        prompt="coin",
     )
-    assert len(result) == 24
-    assert [res["label"] for res in result] == ["coin"] * 24
+    assert len(result) == 25
+    assert [res["label"] for res in result] == ["coin"] * 25
 
 
 def test_template_match():
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index a78457ed..71901c66 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -614,13 +614,17 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
     return answer["text"][0]  # type: ignore
 
 
-def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
-    """'florencev2_object_detection' is a tool that can detect common objects in an
-    image without any text prompt or thresholding. It returns a list of detected objects
-    as labels and their location as bounding boxes.
+def florencev2_object_detection(
+    image: np.ndarray,
+    prompt: str,
+) -> List[Dict[str, Any]]:
+    """'florencev2_object_detection' is a tool that can detect objects given a text
+    prompt such as a phrase or class names separated by commas. It returns a list of
+    detected objects as labels and their location as bounding boxes with score of 1.0.
 
     Parameters:
         image (np.ndarray): The image to used to detect objects
+        prompt (str): Phrase or comma-separated class names of objects to detect
 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -631,17 +635,17 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
     Example
     -------
-        >>> florencev2_object_detection(image)
+        >>> florencev2_object_detection(image, 'person looking out at coyote')
         [
-            {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-            {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
-            {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
         ]
     """
     image_size = image.shape[:2]
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
+        "prompt": prompt,
         "tool": "object_detection",
         "function_name": "florencev2_object_detection",
     }
@@ -1253,7 +1257,6 @@ def overlay_heat_map(
     loca_visual_prompt_counting,
     florencev2_roberta_vqa,
     florencev2_image_caption,
-    florencev2_object_detection,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,