diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index e21dd95c..3c23e569 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,4 +1,3 @@ -import io import json import logging import tempfile @@ -375,6 +374,9 @@ def ocr(image: np.ndarray) -> List[Dict[str, Any]]: ] """ + pil_image = Image.fromarray(image).convert("RGB") + image_size = pil_image.size[::-1] + buffer_bytes = numpy_to_bytes(image) res = requests.post( _OCR_URL, @@ -521,7 +523,7 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str: data: Dict[str, Any] = send_inference_request( payload, "internlm-xcomposer2", files=files, v2=True ) - return data["answer"] + return cast(str, data["answer"]) def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: @@ -540,7 +542,7 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: data: Dict[str, Any] = send_inference_request( payload, "internlm-xcomposer2", files=files, v2=True ) - return data["answer"] + return cast(str, data["answer"]) def git_vqa_v2(prompt: str, image: np.ndarray) -> str: @@ -1441,7 +1443,7 @@ def overlay_segmentation_masks( text_box = draw.textbbox((x, y), text=label, font=font) draw.rectangle((x, y, text_box[2], text_box[3]), fill=color[label]) draw.text((x, y), label, fill="black", font=font) - frame_out.append(np.array(pil_image)) # type: ignore + frame_out.append(np.array(pil_image)) return frame_out[0] if len(frame_out) == 1 else frame_out