landing-ai · shankar-vision-eng · Aug 9, 2024
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
@@ -61,9 +61,10 @@ def test_object_detection():
  img = ski.data.coins()
  result = florencev2_object_detection(
  image=img,
+ prompt="coin",
  )
- assert len(result) == 24
- assert [res["label"] for res in result] == ["coin"] * 24
+ assert len(result) == 25
+ assert [res["label"] for res in result] == ["coin"] * 25
 
 
 def test_template_match():

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -614,13 +614,17 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
  return answer["text"][0] # type: ignore
 
 
-def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
- """'florencev2_object_detection' is a tool that can detect common objects in an
- image without any text prompt or thresholding. It returns a list of detected objects
- as labels and their location as bounding boxes.
+def florencev2_object_detection(
+ image: np.ndarray,
+ prompt: str,
+) -> List[Dict[str, Any]]:
+ """'florencev2_object_detection' is a tool that can detect objects given a text
+ prompt such as a phrase or class names separated by commas. It returns a list of
+ detected objects as labels and their location as bounding boxes with score of 1.0.
 
  Parameters:
  image (np.ndarray): The image to used to detect objects
+ prompt (str): Phrase or classes to detect objects in the image
 
  Returns:
  List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -631,17 +635,17 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
 
  Example
  -------
- >>> florencev2_object_detection(image)
+ >>> florencev2_object_detection(image, 'person looking out at coyote')
  [
- {'score': 1.0, 'label': 'window', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
- {'score': 1.0, 'label': 'person', 'bbox': [0.34, 0.21, 0.85, 0.5},
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
  ]
  """
  image_size = image.shape[:2]
  image_b64 = convert_to_b64(image)
  data = {
  "image": image_b64,
+ "prompt": prompt,
  "tool": "object_detection",
  "function_name": "florencev2_object_detection",
  }
@@ -1253,7 +1257,6 @@ def overlay_heat_map(
  loca_visual_prompt_counting,
  florencev2_roberta_vqa,
  florencev2_image_caption,
- florencev2_object_detection,
  detr_segmentation,
  depth_anything_v2,
  generate_soft_edge_image,