Skip to content

Commit

Permalink
Add Florence2 OCR (#194)
Browse files Browse the repository at this point in the history
  • Loading branch information
dillonalaird authored Aug 13, 2024
1 parent d1f59bb commit 7c9b059
Show file tree
Hide file tree
Showing 4 changed files with 78 additions and 4 deletions.
17 changes: 13 additions & 4 deletions tests/integ/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,18 @@
blip_image_caption,
clip,
closest_mask_distance,
florencev2_image_caption,
depth_anything_v2,
detr_segmentation,
dpt_hybrid_midas,
florencev2_image_caption,
florencev2_object_detection,
florencev2_roberta_vqa,
florencev2_ocr,
generate_pose_image,
generate_soft_edge_image,
florencev2_object_detection,
detr_segmentation,
git_vqa_v2,
grounding_dino,
grounding_sam,
florencev2_roberta_vqa,
loca_visual_prompt_counting,
loca_zero_shot_counting,
ocr,
Expand Down Expand Up @@ -182,6 +183,14 @@ def test_ocr() -> None:
assert any("Region-based segmentation" in res["label"] for res in result)


def test_florencev2_ocr() -> None:
    """Check that florencev2_ocr finds a known phrase in the scikit-image page sample."""
    page_image = ski.data.page()
    detections = florencev2_ocr(image=page_image)
    # At least one detected text region must contain the expected heading text.
    assert any(
        "Region-based segmentation" in detection["label"] for detection in detections
    )


def test_mask_distance():
# Create two binary masks
mask1 = np.zeros((100, 100), dtype=np.uint8)
Expand Down
1 change: 1 addition & 0 deletions vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
florencev2_image_caption,
florencev2_object_detection,
florencev2_roberta_vqa,
florencev2_ocr,
generate_pose_image,
generate_soft_edge_image,
get_tool_documentation,
Expand Down
47 changes: 47 additions & 0 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
denormalize_bbox,
get_image_size,
normalize_bbox,
convert_quad_box_to_bbox,
rle_decode,
)

Expand Down Expand Up @@ -652,6 +653,51 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
return return_data


def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
    """'florencev2_ocr' is a tool that can detect text and text regions in an image.
    Each text region contains one line of text. It returns a list of detected text,
    the text region as a bounding box with normalized coordinates, and a score.
    Results keep the order returned by the inference service; this function does
    not re-sort them.

    Parameters:
        image (np.ndarray): The image to extract text from.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the detected text
            ('label'), the bbox with normalized coordinates ('bbox'), and a score
            ('score'). The score is always the constant 1.0; it is a placeholder
            and not a real confidence value.

    Example
    -------
        >>> florencev2_ocr(image)
        [
            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 1.0},
        ]
    """

    # First two array dimensions; for an H x W[x C] image this is (height, width).
    image_size = image.shape[:2]
    payload = {
        "image": convert_to_b64(image),
        "task": "<OCR_WITH_REGION>",
        "function_name": "florencev2_ocr",
    }

    response = send_inference_request(payload, "florence2", v2=True)
    ocr_result = response["<OCR_WITH_REGION>"]

    # Each quadrilateral is collapsed to its enclosing axis-aligned box before
    # normalization. Labels are looked up by position, so a response with fewer
    # labels than quad boxes still raises IndexError rather than being truncated.
    return [
        {
            "label": ocr_result["labels"][i],
            "bbox": normalize_bbox(convert_quad_box_to_bbox(quad_box), image_size),
            # Only quad boxes and labels are read from the response, so a fixed
            # score is reported (presumably no per-region confidence is
            # available for this task -- TODO confirm).
            "score": 1.0,
        }
        for i, quad_box in enumerate(ocr_result["quad_boxes"])
    ]


def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
"""'detr_segmentation' is a tool that can segment common objects in an
image without any text prompt. It returns a list of detected objects
Expand Down Expand Up @@ -1248,6 +1294,7 @@ def overlay_heat_map(
loca_visual_prompt_counting,
florencev2_roberta_vqa,
florencev2_image_caption,
florencev2_ocr,
detr_segmentation,
depth_anything_v2,
generate_soft_edge_image,
Expand Down
17 changes: 17 additions & 0 deletions vision_agent/utils/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,23 @@ def denormalize_bbox(
return bbox


def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
    """Convert a quadrilateral bounding box to a rectangular bounding box.

    Parameters:
        quad_box: the quadrilateral bounding box, given as eight values
            [x1, y1, x2, y2, x3, y3, x4, y4] (four corner points).

    Returns:
        The rectangular bounding box [x_min, y_min, x_max, y_max] that encloses
        all four corners.

    Raises:
        ValueError: if quad_box does not contain exactly eight values.
    """
    # Unpacking enforces the eight-value layout before any min/max is taken.
    ax, ay, bx, by, cx, cy, dx, dy = quad_box
    xs = (ax, bx, cx, dx)
    ys = (ay, by, cy, dy)
    return [min(xs), min(ys), max(xs), max(ys)]


def overlay_bboxes(
image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
) -> ImageType:
Expand Down

0 comments on commit 7c9b059

Please sign in to comment.