landing-ai · dillonalaird · Aug 13, 2024 · Aug 11, 2024 · Aug 11, 2024 · Aug 13, 2024
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
@@ -5,17 +5,18 @@
  blip_image_caption,
  clip,
  closest_mask_distance,
- florencev2_image_caption,
  depth_anything_v2,
+ detr_segmentation,
  dpt_hybrid_midas,
+ florencev2_image_caption,
+ florencev2_object_detection,
+ florencev2_roberta_vqa,
+ florencev2_ocr,
  generate_pose_image,
  generate_soft_edge_image,
- florencev2_object_detection,
- detr_segmentation,
  git_vqa_v2,
  grounding_dino,
  grounding_sam,
- florencev2_roberta_vqa,
  loca_visual_prompt_counting,
  loca_zero_shot_counting,
  ocr,
@@ -182,6 +183,14 @@ def test_ocr() -> None:
  assert any("Region-based segmentation" in res["label"] for res in result)
 
 
+ def test_florencev2_ocr() -> None:
+     # OCR output should include the phrase also asserted by test_ocr above.
+     page = ski.data.page()
+     detections = florencev2_ocr(image=page)
+     expected = "Region-based segmentation"
+     assert any(expected in det["label"] for det in detections)
+
+
 def test_mask_distance():
  # Create two binary masks
  mask1 = np.zeros((100, 100), dtype=np.uint8)

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
@@ -19,6 +19,7 @@
  florencev2_image_caption,
  florencev2_object_detection,
  florencev2_roberta_vqa,
+ florencev2_ocr,
  generate_pose_image,
  generate_soft_edge_image,
  get_tool_documentation,

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -28,6 +28,7 @@
  denormalize_bbox,
  get_image_size,
  normalize_bbox,
+ convert_quad_box_to_bbox,
  rle_decode,
 )
 
@@ -652,6 +653,50 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
  return return_data
 
 
+ def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
+     """'florencev2_ocr' is a tool that can detect text in an image without any text
+     prompt. It returns a list of detected text and bounding boxes with normalized
+     coordinates, in model output order. The 'score' is always the constant 1.0.
+
+     Parameters:
+         image (np.ndarray): The image to extract text from.
+
+     Returns:
+         List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
+             with normalized coordinates, and a constant score of 1.0.
+
+     Example
+     -------
+         >>> florencev2_ocr(image)
+         [
+             {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 1.0},
+         ]
+     """
+
+     image_size = image.shape[:2]
+     image_b64 = convert_to_b64(image)
+     data = {
+         "image": image_b64,
+         "task": "<OCR_WITH_REGION>",
+         "function_name": "florencev2_ocr",
+     }
+
+     response = send_inference_request(data, "florence2", v2=True)
+     ocr_result = response["<OCR_WITH_REGION>"]
+     # Labels and quad boxes are assumed parallel; pair them instead of indexing.
+     return [
+         {
+             "label": label,
+             "bbox": normalize_bbox(
+                 convert_quad_box_to_bbox(quad_box), image_size
+             ),
+             # The response is not read for a confidence, hence the constant.
+             "score": 1.0,
+         }
+         for label, quad_box in zip(ocr_result["labels"], ocr_result["quad_boxes"])
+     ]
+
+
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
  """'detr_segmentation' is a tool that can segment common objects in an
  image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1293,7 @@ def overlay_heat_map(
  loca_visual_prompt_counting,
  florencev2_roberta_vqa,
  florencev2_image_caption,
+ florencev2_ocr,
  detr_segmentation,
  depth_anything_v2,
  generate_soft_edge_image,

diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py
@@ -140,6 +140,23 @@ def denormalize_bbox(
  return bbox
 
 
+ def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
+     r"""Collapse a quadrilateral into its axis-aligned bounding rectangle.
+
+     Parameters:
+         quad_box: eight numbers read as four (x, y) corner pairs,
+             i.e. [x1, y1, x2, y2, x3, y3, x4, y4]
+
+     Returns:
+         [x_min, y_min, x_max, y_max] enclosing all four corners
+     """
+     ax, ay, bx, by, cx, cy, dx, dy = quad_box
+     xs = (ax, bx, cx, dx)
+     ys = (ay, by, cy, dy)
+     # Extremes along each axis give the tightest enclosing rectangle.
+     return [min(xs), min(ys), max(xs), max(ys)]
+
+
 def overlay_bboxes(
  image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
 ) -> ImageType: