diff --git a/vision_agent/clients/__init__.py b/vision_agent/clients/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/vision_agent/clients/http.py b/vision_agent/clients/http.py
new file mode 100644
index 00000000..6cd37654
--- /dev/null
+++ b/vision_agent/clients/http.py
@@ -0,0 +1,48 @@
+import json
+import logging
+from typing import Any, Dict, Optional
+
+from requests import Session
+from requests.adapters import HTTPAdapter
+from requests.exceptions import ConnectionError, RequestException, Timeout
+
+_LOGGER = logging.getLogger(__name__)
+
+
+class BaseHTTP:
+    _TIMEOUT = 30  # seconds
+    _MAX_RETRIES = 3
+
+    def __init__(self, base_endpoint: str, *, headers: Optional[Dict[str, Any]] = None) -> None:
+        self._headers = headers
+        if headers is None:
+            self._headers = {
+                "Content-Type": "application/json",
+            }
+        self._base_endpoint = base_endpoint
+        self._session = Session()
+        self._session.headers.update(self._headers)
+        self._session.mount(self._base_endpoint, HTTPAdapter(max_retries=self._MAX_RETRIES))
+
+    def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
+        formatted_url = f"{self._base_endpoint}/{url}"
+        _LOGGER.info(f"Sending data to {formatted_url}")
+        try:
+            response = self._session.post(
+                url=formatted_url,
+                json=payload,
+                timeout=self._TIMEOUT,
+            )
+            response.raise_for_status()
+            result: Dict[str, Any] = response.json()
+            _LOGGER.info(json.dumps(result))
+        except (ConnectionError, Timeout, RequestException) as err:
+            _LOGGER.warning(f"Error: {err}.")
+            raise
+        except json.JSONDecodeError:
+            # .json() is the only call above that raises JSONDecodeError,
+            # so response is always bound when this handler runs.
+            resp_text = response.text
+            _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+            raise
+        return result
diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py
new file mode 100644
index 00000000..846ea935
--- /dev/null
+++ b/vision_agent/clients/landing_public_api.py
@@ -0,0 +1,26 @@
+import os
+from uuid import UUID
+from typing import List
+
+from vision_agent.clients.http import BaseHTTP
+from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.tool_types import BboxInputBase64
+
+
+class LandingPublicAPI(BaseHTTP):
+    def __init__(self) -> None:
+        landing_url = os.environ.get("LANDINGAI_URL", "https://api.dev.landing.ai")
+        landing_api_key = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
+        headers = {"Content-Type": "application/json", "apikey": landing_api_key}
+        super().__init__(base_endpoint=landing_url, headers=headers)
+
+    def launch_fine_tuning_job(
+        self, model_name: str, task: str, bboxes: List[BboxInputBase64]
+    ) -> UUID:
+        url = "v1/agent/jobs/fine-tuning"
+        data = {
+            "model": {"name": model_name, "task": task},
+            "bboxes": [bbox.model_dump(by_alias=True) for bbox in bboxes],
+        }
+        response = self.post(url, payload=data)
+        return UUID(response["jobId"])
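BaseHTTP mounts a retrying HTTPAdapter against the base endpoint, and post() returns the parsed JSON body, which LandingPublicAPI uses to read "jobId". A minimal usage sketch (assuming LANDINGAI_API_KEY is set in the environment; the dev endpoint is the default):

# sketch only: performs a live request and needs a valid API key
from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.tool_types import BboxInputBase64

client = LandingPublicAPI()  # reads LANDINGAI_URL / LANDINGAI_API_KEY
job_id = client.launch_fine_tuning_job(
    model_name="florencev2",
    task="",  # task selection is still a TODO upstream
    bboxes=[
        BboxInputBase64(
            image="<base64-encoded image>",  # placeholder, not a real payload
            filename="filename.png",
            labels=["screw"],
            bboxes=[(370, 30, 560, 290)],
        )
    ],
)
print(job_id)  # UUID parsed from the "jobId" field of the response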
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index e95a78c7..1ea94510 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -19,6 +19,7 @@
     florencev2_image_caption,
     florencev2_object_detection,
     florencev2_roberta_vqa,
+    florencev2_fine_tuning,
     generate_pose_image,
     generate_soft_edge_image,
     get_tool_documentation,
diff --git a/vision_agent/tools/tool_types.py b/vision_agent/tools/tool_types.py
new file mode 100644
index 00000000..c575cec8
--- /dev/null
+++ b/vision_agent/tools/tool_types.py
@@ -0,0 +1,20 @@
+from typing import List, Tuple
+
+from nptyping import UInt8, NDArray, Shape
+from pydantic import BaseModel, ConfigDict
+
+
+class BboxInput(BaseModel):
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    image: NDArray[Shape["Height, Width, 3"], UInt8]
+    filename: str
+    labels: List[str]
+    bboxes: List[Tuple[int, int, int, int]]
+
+
+class BboxInputBase64(BaseModel):
+    image: str
+    filename: str
+    labels: List[str]
+    bboxes: List[Tuple[int, int, int, int]]
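BboxInput carries the raw numpy image (hence arbitrary_types_allowed), while BboxInputBase64 is the wire format with the image base64-encoded. A quick validation sketch (the array contents and filename are illustrative):

# sketch only: a plain dict validates into the model
import numpy as np
from vision_agent.tools.tool_types import BboxInput

image = np.zeros((480, 640, 3), dtype=np.uint8)  # Height x Width x 3, uint8
bbox_input = BboxInput.model_validate(
    {
        "image": image,
        "filename": "filename.png",
        "labels": ["screw"],
        "bboxes": [(370, 30, 560, 290)],
    }
)
print(bbox_input.labels, bbox_input.bboxes)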
"function_name": "florencev2_roberta_vqa", } answer = send_inference_request(data, "tools") @@ -463,7 +459,6 @@ def git_vqa_v2(prompt: str, image: np.ndarray) -> str: "image": image_b64, "prompt": prompt, "tool": "image_question_answering", - "function_name": "git_vqa_v2", } answer = send_inference_request(data, "tools") @@ -494,7 +489,6 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]: "prompt": ",".join(classes), "image": image_b64, "tool": "closed_set_image_classification", - "function_name": "clip", } resp_data = send_inference_request(data, "tools") resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]] @@ -522,7 +516,6 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]: data = { "image": image_b64, "tool": "image_classification", - "function_name": "vit_image_classification", } resp_data = send_inference_request(data, "tools") resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]] @@ -550,7 +543,6 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]: data = { "image": image_b64, "tool": "nsfw_image_classification", - "function_name": "vit_nsfw_classification", } resp_data = send_inference_request(data, "tools") resp_data["scores"] = round(resp_data["scores"], 4) @@ -577,7 +569,6 @@ def blip_image_caption(image: np.ndarray) -> str: data = { "image": image_b64, "tool": "image_captioning", - "function_name": "blip_image_caption", } answer = send_inference_request(data, "tools") @@ -606,7 +597,6 @@ def florencev2_image_caption(image: np.ndarray, detail_caption: bool = True) -> "image": image_b64, "tool": "florence2_image_captioning", "detail_caption": detail_caption, - "function_name": "florencev2_image_caption", } answer = send_inference_request(data, "tools") @@ -642,7 +632,6 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]: data = { "image": image_b64, "tool": "object_detection", - "function_name": "florencev2_object_detection", } answer = send_inference_request(data, "tools") @@ -658,6 +647,42 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]: return return_data +def florencev2_fine_tuning(bboxes: List[Dict[str, Any]]) -> UUID: + """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able + to detect objects in an image based on a given dataset. It returns the fine + tuning job id. + + Parameters: + bboxes (List[BboxInput]): A list of BboxInput containing the + image object, image filename, labels and bounding boxes. + + Returns: + UUID: The fine tuning job id, this id will used to retrieve the fine + tuned model. + + Example + ------- + >>> fine_tuning_job_id = florencev2_fine_tuning( + [{'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[370, 30, 560, 290]]}, + {'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[120, 0, 300, 170]]}] + ) + """ + bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] + fine_tuning_request = [ + BboxInputBase64( + image=convert_to_b64(bbox_input.image), + filename=bbox_input.filename, + labels=bbox_input.labels, + bboxes=bbox_input.bboxes + ) + for bbox_input in bboxes_input + ] + # TODO: receive the task from user prompt + task = "" + landing_api = LandingPublicAPI() + return landing_api.launch_fine_tuning_job("florencev2", task, fine_tuning_request) + + def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]: """'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. 
@@ -699,7 +724,6 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
     data = {
         "image": image_b64,
         "tool": "panoptic_segmentation",
-        "function_name": "detr_segmentation",
     }
 
     answer = send_inference_request(data, "tools")
@@ -742,7 +766,6 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_depth",
-        "function_name": "depth_anything_v2",
     }
 
     answer = send_inference_request(data, "tools")
@@ -774,7 +797,6 @@ def generate_soft_edge_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_hed",
-        "function_name": "generate_soft_edge_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -807,7 +829,6 @@ def dpt_hybrid_midas(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_normal",
-        "function_name": "dpt_hybrid_midas",
     }
 
     answer = send_inference_request(data, "tools")
@@ -839,7 +860,6 @@ def generate_pose_image(image: np.ndarray) -> np.ndarray:
     data = {
         "image": image_b64,
         "tool": "generate_pose",
-        "function_name": "generate_pose_image",
     }
 
     answer = send_inference_request(data, "tools")
@@ -880,7 +900,6 @@ def template_match(
         "image": image_b64,
         "template": template_image_b64,
         "tool": "template_match",
-        "function_name": "template_match",
     }
 
     answer = send_inference_request(data, "tools")
@@ -1062,21 +1081,15 @@ def save_video(
     if fps <= 0:
         _LOGGER.warning(f"Invalid fps value: {fps}. Setting fps to 4 (default value).")
         fps = 4
-
-    if not output_video_path:
-        output_video_path = tempfile.NamedTemporaryFile(
-            suffix=".mp4", delete=False
-        ).name
-
-    height, width, layers = frames[0].shape if frames else (0, 0, 0)
-    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # type: ignore
-    video = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))
-    for frame in frames:
-        video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
-    video.release()
-
-    _save_video_to_result(output_video_path)
-    return output_video_path
+    with ImageSequenceClip(frames, fps=fps) as video:
+        if output_video_path:
+            f = open(output_video_path, "wb")
+        else:
+            f = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)  # type: ignore
+        video.write_videofile(f.name, codec="libx264")
+        f.close()
+        _save_video_to_result(f.name)
+        return f.name
 
 
 def _save_video_to_result(video_uri: str) -> None:
@@ -1246,6 +1259,50 @@ def overlay_heat_map(
     return np.array(combined)
 
 
+def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
+    docstrings = ""
+    for func in funcs:
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
+
+    return docstrings
+
+
+def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+    descriptions = ""
+    for func in funcs:
+        description = func.__doc__
+        if description is None:
+            description = ""
+
+        if "Parameters:" in description:
+            description = (
+                description[: description.find("Parameters:")]
+                .replace("\n", " ")
+                .strip()
+            )
+
+        description = " ".join(description.split())
+        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+    return descriptions
+
+
+def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+    for func in funcs:
+        desc = func.__doc__
+        if desc is None:
+            desc = ""
+        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+        desc = " ".join(desc.split())
+
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        data["desc"].append(desc)
+        data["doc"].append(doc)
+
+    return pd.DataFrame(data)  # type: ignore
+
+
 TOOLS = [
     owl_v2,
     grounding_sam,
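These helpers move from tool_utils into tools.py and derive the metadata the agent prompts consume directly from each function's signature and docstring. A short sketch (assuming imports from vision_agent.tools.tools, where they are defined alongside TOOLS):

# sketch only: prints one "- name(signature): summary" line per tool
from vision_agent.tools.tools import TOOLS, get_tool_descriptions, get_tools_df

print(get_tool_descriptions(TOOLS))
df = get_tools_df(TOOLS)  # DataFrame with "desc" and "doc" columns
print(df.head())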
@@ -1259,6 +1316,7 @@ def overlay_heat_map(
     florencev2_roberta_vqa,
     florencev2_image_caption,
     florencev2_object_detection,
+    florencev2_fine_tuning,
     detr_segmentation,
     depth_anything_v2,
     generate_soft_edge_image,
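For reference, a sketch of the moviepy-backed save_video in use (assuming the existing signature of frames, an optional output path, and fps, and that ffmpeg is available for the libx264 encode):

# sketch only: writes 24 black RGB frames to a temporary .mp4
import numpy as np
from vision_agent.tools.tools import save_video

frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(24)]
path = save_video(frames, fps=12)  # falls back to fps=4 when fps <= 0
print(path)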