diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index d2687683..6e323b79 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1,17 +1,15 @@
 import io
 import json
-import inspect
 import logging
 import tempfile
 from uuid import UUID
 from pathlib import Path
 from importlib import resources
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import cv2
 import requests
 import numpy as np
-import pandas as pd
 from pytube import YouTube  # type: ignore
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
@@ -19,7 +17,12 @@
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_types import BboxInput, BboxInputBase64
-from vision_agent.tools.tool_utils import send_inference_request
+from vision_agent.tools.tool_utils import (
+    send_inference_request,
+    get_tool_descriptions,
+    get_tool_documentation,
+    get_tools_df,
+)
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
@@ -57,7 +60,6 @@
 ]
 _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
 _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
-logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
 
 
@@ -108,6 +110,7 @@ def grounding_dino(
             "visual_grounding" if model_size == "large" else "visual_grounding_tiny"
         ),
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_dino",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -163,6 +166,7 @@ def owl_v2(
         "image": image_b64,
         "tool": "open_vocab_detection",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "owl_v2",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -227,6 +231,7 @@ def grounding_sam(
         "image": image_b64,
         "tool": "visual_grounding_segment",
         "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold},
+        "function_name": "grounding_sam",
     }
     data: Dict[str, Any] = send_inference_request(request_data, "tools")
     return_data = []
@@ -366,6 +371,7 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "zero_shot_counting",
+        "function_name": "loca_zero_shot_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -401,6 +407,7 @@ def loca_visual_prompt_counting(
         "image": image_b64,
         "prompt": bbox_str,
         "tool": "few_shot_counting",
+        "function_name": "loca_visual_prompt_counting",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0]))
@@ -430,6 +437,7 @@ def florencev2_roberta_vqa(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering_with_context",
+        "function_name": "florencev2_roberta_vqa",
     }
 
     answer = send_inference_request(data, "tools")
@@ -459,6 +467,7 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str:
         "image": image_b64,
         "prompt": prompt,
         "tool": "image_question_answering",
+        "function_name": "git_vqa_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -489,6 +498,7 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         "prompt": ",".join(classes),
         "image": image_b64,
         "tool": "closed_set_image_classification",
+        "function_name": "clip",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -516,6 +526,7 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "image_classification",
+        "function_name": "vit_image_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]]
@@ -543,6 +554,7 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
     data = {
         "image": image_b64,
         "tool": "nsfw_image_classification",
+        "function_name": "vit_nsfw_classification",
     }
     resp_data = send_inference_request(data, "tools")
     resp_data["scores"] = round(resp_data["scores"], 4)
@@ -569,6 +581,7 @@ def blip_image_caption(image: np.ndarray) -> str:
     data = {
         "image": image_b64,
         "tool": "image_captioning",
+        "function_name": "blip_image_caption",
     }
 
     answer = send_inference_request(data, "tools")
@@ -597,6 +610,7 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) ->
         "image": image_b64,
         "tool": "florence2_image_captioning",
         "detail_caption": detail_caption,
+        "function_name": "florencev2_image_caption",
     }
 
     answer = send_inference_request(data, "tools")
@@ -632,6 +646,7 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "object_detection",
+        "function_name": "florencev2_object_detection",
     }
 
     answer = send_inference_request(data, "tools")
@@ -673,7 +688,7 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]]) -> UUID:
             image=convert_to_b64(bbox_input.image),
             filename=bbox_input.filename,
             labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes
+            bboxes=bbox_input.bboxes,
         )
         for bbox_input in bboxes_input
     ]
@@ -724,6 +739,7 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "panoptic_segmentation",
+        "function_name": "detr_segmentation",
     }
 
     answer = send_inference_request(data, "tools")
@@ -766,6 +782,7 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_depth",
+        "function_name": "depth_anything_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -797,6 +814,7 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_hed",
+        "function_name": "generate_soft_edge_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -829,6 +847,7 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_normal",
+        "function_name": "dpt_hybrid_midas",
     }
 
     answer = send_inference_request(data, "tools")
@@ -900,6 +919,7 @@ def template_match(
         "image": image_b64,
         "template": template_image_b64,
         "tool": "template_match",
+        "function_name": "template_match",
     }
 
     answer = send_inference_request(data, "tools")
@@ -1259,50 +1279,6 @@ def overlay_heat_map(
     return np.array(combined)
 
 
-def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
-    docstrings = ""
-    for func in funcs:
-        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
-
-    return docstrings
-
-
-def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
-    descriptions = ""
-    for func in funcs:
-        description = func.__doc__
-        if description is None:
-            description = ""
-
-        if "Parameters:" in description:
-            description = (
-                description[: description.find("Parameters:")]
-                .replace("\n", " ")
-                .strip()
-            )
-
-        description = " ".join(description.split())
-        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
-    return descriptions
-
-
-def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
-    data: Dict[str, List[str]] = {"desc": [], "doc": []}
-
-    for func in funcs:
-        desc = func.__doc__
-        if desc is None:
-            desc = ""
-        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
-        desc = " ".join(desc.split())
-
-        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
-        data["desc"].append(desc)
-        data["doc"].append(doc)
-
-    return pd.DataFrame(data)  # type: ignore
-
-
 TOOLS = [
     owl_v2,
     grounding_sam,