
Commit

added florence2 for OCR
dillonalaird committed Aug 12, 2024
1 parent 432ddca commit 7ba2638
Showing 3 changed files with 64 additions and 0 deletions.
1 change: 1 addition & 0 deletions vision_agent/tools/__init__.py
@@ -19,6 +19,7 @@
    florencev2_image_caption,
    florencev2_object_detection,
    florencev2_roberta_vqa,
    florencev2_ocr,
    generate_pose_image,
    generate_soft_edge_image,
    get_tool_documentation,
46 changes: 46 additions & 0 deletions vision_agent/tools/tools.py
@@ -28,6 +28,7 @@
    denormalize_bbox,
    get_image_size,
    normalize_bbox,
    convert_quad_box_to_bbox,
    rle_decode,
)

@@ -652,6 +653,50 @@ def florencev2_object_detection(image: np.ndarray, prompt: str) -> List[Dict[str
    return return_data


def florencev2_ocr(image: np.ndarray) -> List[Dict[str, Any]]:
    """'florencev2_ocr' is a tool that can detect text in an image without any text
    prompt. It returns a list of detected text, bounding boxes with normalized
    coordinates, and confidence scores. The results are sorted from top-left to
    bottom-right.

    Parameters:
        image (np.ndarray): The image to extract text from.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox
            with normalized coordinates, and confidence score.

    Example
    -------
        >>> florencev2_ocr(image)
        [
            {'label': 'hello world', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99},
        ]
    """

    image_size = image.shape[:2]
    image_b64 = convert_to_b64(image)
    data = {
        "image": image_b64,
        "task": "<OCR_WITH_REGION>",
        "function_name": "florencev2_ocr",
    }

    detections = send_inference_request(data, "florence2", v2=True)
    detections = detections["<OCR_WITH_REGION>"]
    return_data = []
    for i in range(len(detections["quad_boxes"])):
        return_data.append(
            {
                "label": detections["labels"][i],
                "bbox": normalize_bbox(
                    convert_quad_box_to_bbox(detections["quad_boxes"][i]), image_size
                ),
                # a fixed score of 1.0 is assigned; no confidence is read from the response
                "score": 1.0,
            }
        )
    return return_data


def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
    """'detr_segmentation' is a tool that can segment common objects in an
    image without any text prompt. It returns a list of detected objects
@@ -1248,6 +1293,7 @@ def overlay_heat_map(
    loca_visual_prompt_counting,
    florencev2_roberta_vqa,
    florencev2_image_caption,
    florencev2_ocr,
    detr_segmentation,
    depth_anything_v2,
    generate_soft_edge_image,
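
A minimal usage sketch for the new florencev2_ocr tool. The image path "receipt.jpg" and the PIL-based loading below are illustrative assumptions; any np.ndarray image works, and running the call requires the vision_agent inference endpoint to be reachable:

    import numpy as np
    from PIL import Image

    from vision_agent.tools import florencev2_ocr

    # load an image into an RGB np.ndarray (any loader that yields an array works)
    image = np.array(Image.open("receipt.jpg").convert("RGB"))

    # each detection carries the recognized text, a normalized [x1, y1, x2, y2] bbox,
    # and a confidence score
    for det in florencev2_ocr(image):
        print(det["label"], det["bbox"], det["score"])
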
17 changes: 17 additions & 0 deletions vision_agent/utils/image_utils.py
@@ -140,6 +140,23 @@ def denormalize_bbox(
    return bbox


def convert_quad_box_to_bbox(quad_box: List[Union[int, float]]) -> List[float]:
    r"""Convert a quadrilateral bounding box to a rectangular bounding box.

    Parameters:
        quad_box: the quadrilateral bounding box given as [x1, y1, x2, y2, x3, y3, x4, y4]

    Returns:
        The rectangular bounding box as [x_min, y_min, x_max, y_max]
    """
    x1, y1, x2, y2, x3, y3, x4, y4 = quad_box
    x_min = min(x1, x2, x3, x4)
    x_max = max(x1, x2, x3, x4)
    y_min = min(y1, y2, y3, y4)
    y_max = max(y1, y2, y3, y4)
    return [x_min, y_min, x_max, y_max]


def overlay_bboxes(
    image: Union[str, Path, np.ndarray, ImageType], bboxes: Dict
) -> ImageType:
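
A quick worked example of the new convert_quad_box_to_bbox helper, assuming it is importable as vision_agent.utils.image_utils (matching the file path); the corner coordinates are illustrative:

    from vision_agent.utils.image_utils import convert_quad_box_to_bbox

    # a slightly rotated text region given by its four corners,
    # flattened as [x1, y1, x2, y2, x3, y3, x4, y4]
    quad = [10, 12, 108, 10, 110, 40, 12, 42]

    # min/max over x and y collapse the quad to [x_min, y_min, x_max, y_max]
    print(convert_quad_box_to_bbox(quad))  # [10, 10, 110, 42]
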
