diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 958b2cf6..8dddf8bc 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -13,26 +13,27 @@
 import numpy as np
 import requests
 from moviepy.editor import ImageSequenceClip
-from PIL import Image, ImageDraw, ImageFont, ImageEnhance
+from PIL import Image, ImageDraw, ImageEnhance, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.lmm.lmm import OpenAILMM
 from vision_agent.tools.tool_utils import (
+    filter_bboxes_by_threshold,
     get_tool_descriptions,
     get_tool_documentation,
     get_tools_df,
     get_tools_info,
     send_inference_request,
     send_task_inference_request,
-    filter_bboxes_by_threshold,
 )
 from vision_agent.tools.tools_types import (
     FineTuning,
     Florence2FtRequest,
     JobStatus,
-    PromptTask,
     ODResponseData,
+    PromptTask,
 )
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.exceptions import FineTuneModelIsNotReady
@@ -42,6 +43,7 @@
     convert_quad_box_to_bbox,
     convert_to_b64,
     denormalize_bbox,
+    encode_image_bytes,
     frames_to_bytes,
     get_image_size,
     normalize_bbox,
@@ -691,6 +693,69 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
+def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
+    """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
+    including regular images or images of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the image
+        image (np.ndarray): The reference image used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_image_vqa('What is the cat doing?', image)
+        'drinking milk'
+    """
+
+    lmm = OpenAILMM()
+    buffer = io.BytesIO()
+    Image.fromarray(image).save(buffer, format="PNG")
+    image_bytes = buffer.getvalue()
+    image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+    resp = lmm.generate(prompt, [image_b64])
+    return cast(str, resp)
+
+
+def gpt4o_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
+    """'gpt4o_video_vqa' is a tool that can answer any questions about arbitrary videos
+    including regular videos or videos of documents or presentations. It returns text
+    as an answer to the question.
+
+    Parameters:
+        prompt (str): The question about the video
+        frames (List[np.ndarray]): The reference frames used for the question
+
+    Returns:
+        str: A string which is the answer to the given prompt.
+
+    Example
+    -------
+        >>> gpt4o_video_vqa('Which football player made the goal?', frames)
+        'Lionel Messi'
+    """
+
+    lmm = OpenAILMM()
+
+    if len(frames) > 10:
+        step = len(frames) / 10
+        frames = [frames[int(i * step)] for i in range(10)]
+
+    frames_b64 = []
+    for frame in frames:
+        buffer = io.BytesIO()
+        Image.fromarray(frame).save(buffer, format="PNG")
+        image_bytes = buffer.getvalue()
+        image_b64 = "data:image/png;base64," + encode_image_bytes(image_bytes)
+        frames_b64.append(image_b64)
+
+    resp = lmm.generate(prompt, frames_b64)
+    return cast(str, resp)
+
+
 def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
     """'git_vqa_v2' is a tool that can answer questions about the visual contents of
     an image given a question and an image. It returns an answer to the
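
A quick usage sketch of the two new tools for reviewers (not part of the diff). It assumes OPENAI_API_KEY is set in the environment for OpenAILMM, that the import path matches the file touched above, and that extract_frames_from_video yields (frame, timestamp) pairs as elsewhere in this codebase; the media file names are hypothetical.

    import numpy as np
    from PIL import Image

    from vision_agent.tools.tools import gpt4o_image_vqa, gpt4o_video_vqa
    from vision_agent.utils import extract_frames_from_video

    # Single image: the tool PNG-encodes the array into a data URI and sends
    # one image to GPT-4o via OpenAILMM.
    image = np.array(Image.open("cat.jpg").convert("RGB"))  # hypothetical file
    print(gpt4o_image_vqa("What is the cat doing?", image))

    # Video: pass a list of frames; gpt4o_video_vqa subsamples to at most 10
    # frames before encoding, so long clips stay cheap to query.
    frames = [frame for frame, _ in extract_frames_from_video("match.mp4")]
    print(gpt4o_video_vqa("Which football player made the goal?", frames))

Note the sampling in gpt4o_video_vqa: step = len(frames) / 10 with frames[int(i * step)] picks 10 frames spread evenly across the whole clip rather than the first 10, so the answer can reflect the full video.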