diff --git a/vision_agent/fonts/__init__.py b/vision_agent/fonts/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vision_agent/fonts/arial.ttf b/vision_agent/fonts/arial.ttf new file mode 100644 index 00000000..1537c5b8 Binary files /dev/null and b/vision_agent/fonts/arial.ttf differ diff --git a/vision_agent/image_utils.py b/vision_agent/image_utils.py index 849f912f..d482056f 100644 --- a/vision_agent/image_utils.py +++ b/vision_agent/image_utils.py @@ -1,6 +1,7 @@ """Utility functions for image processing.""" import base64 +from importlib import resources from io import BytesIO from pathlib import Path from typing import Dict, Tuple, Union @@ -104,19 +105,27 @@ def overlay_bboxes( color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])} - draw = ImageDraw.Draw(image) - font = ImageFont.load_default() width, height = image.size + fontsize = max(12, int(min(width, height) / 40)) + draw = ImageDraw.Draw(image) + font = ImageFont.truetype( + str(resources.files("vision_agent.fonts").joinpath("arial.ttf")), fontsize + ) if "bboxes" not in bboxes: return image.convert("RGB") - for label, box in zip(bboxes["labels"], bboxes["bboxes"]): - box = [box[0] * width, box[1] * height, box[2] * width, box[3] * height] - draw.rectangle(box, outline=color[label], width=3) - label = f"{label}" - text_box = draw.textbbox((box[0], box[1]), text=label, font=font) - draw.rectangle(text_box, fill=color[label]) - draw.text((text_box[0], text_box[1]), label, fill="black", font=font) + for label, box, scores in zip(bboxes["labels"], bboxes["bboxes"], bboxes["scores"]): + box = [ + int(box[0] * width), + int(box[1] * height), + int(box[2] * width), + int(box[3] * height), + ] + draw.rectangle(box, outline=color[label], width=4) + text = f"{label}: {scores:.2f}" + text_box = draw.textbbox((box[0], box[1]), text=text, font=font) + draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label]) + draw.text((box[0], box[1]), text, fill="black", font=font) return image.convert("RGB")