diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index c5860a60..290133eb 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,5 +1,6 @@ import os import subprocess +from uuid import UUID from pathlib import Path from typing import Any, Dict, List, Union @@ -7,6 +8,9 @@ from vision_agent.lmm.types import Message from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS +from vision_agent.utils.image_utils import convert_to_b64 +from vision_agent.clients.landing_public_api import LandingPublicAPI +from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -385,6 +389,46 @@ def get_tool_descriptions() -> str: return TOOL_DESCRIPTIONS +def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: + """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able + to detect objects in an image based on a given dataset. It returns the fine + tuning job id. + + Parameters: + bboxes (List[BboxInput]): A list of BboxInput containing the + image object, image filename, labels and bounding boxes. + task (PromptTask): The florencev2 fine-tuning task. The options are + CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. + + Returns: + UUID: The fine tuning job id, this id will used to retrieve the fine + tuned model. + + Example + ------- + >>> fine_tuning_job_id = florencev2_fine_tuning( + [{'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[370, 30, 560, 290]]}, + {'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[120, 0, 300, 170]]}], + "OBJECT_DETECTION" + ) + """ + bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] + task_input = PromptTask[task] + fine_tuning_request = [ + BboxInputBase64( + image=convert_to_b64(bbox_input.image), + filename=bbox_input.filename, + labels=bbox_input.labels, + bboxes=bbox_input.bboxes, + ) + for bbox_input in bboxes_input + ] + landing_api = LandingPublicAPI() + return landing_api.launch_fine_tuning_job( + "florencev2", task_input, fine_tuning_request + ) + + META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -398,5 +442,6 @@ def get_tool_descriptions() -> str: search_dir, search_file, find_file, + florencev2_fine_tuning, ] ) diff --git a/vision_agent/tools/tool_types.py b/vision_agent/tools/meta_tools_types.py similarity index 100% rename from vision_agent/tools/tool_types.py rename to vision_agent/tools/meta_tools_types.py diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index d40b13c1..a78457ed 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -2,7 +2,6 @@ import json import logging import tempfile -from uuid import UUID from pathlib import Path from importlib import resources from typing import Any, Dict, List, Optional, Tuple, Union, cast @@ -15,8 +14,6 @@ from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore -from vision_agent.clients.landing_public_api import LandingPublicAPI -from vision_agent.tools.tool_types import BboxInput, BboxInputBase64, PromptTask from vision_agent.tools.tool_utils import ( send_inference_request, get_tool_descriptions, @@ -662,46 +659,6 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]: return return_data -def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: - """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able - to detect objects in an image based on a given dataset. It returns the fine - tuning job id. - - Parameters: - bboxes (List[BboxInput]): A list of BboxInput containing the - image object, image filename, labels and bounding boxes. - task (PromptTask): The florencev2 fine-tuning task. The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - - Returns: - UUID: The fine tuning job id, this id will used to retrieve the fine - tuned model. - - Example - ------- - >>> fine_tuning_job_id = florencev2_fine_tuning( - [{'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[370, 30, 560, 290]]}, - {'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[120, 0, 300, 170]]}], - "OBJECT_DETECTION" - ) - """ - bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] - task_input = PromptTask[task] - fine_tuning_request = [ - BboxInputBase64( - image=convert_to_b64(bbox_input.image), - filename=bbox_input.filename, - labels=bbox_input.labels, - bboxes=bbox_input.bboxes, - ) - for bbox_input in bboxes_input - ] - landing_api = LandingPublicAPI() - return landing_api.launch_fine_tuning_job( - "florencev2", task_input, fine_tuning_request - ) - - def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]: """'detr_segmentation' is a tool that can segment common objects in an image without any text prompt. It returns a list of detected objects @@ -1297,7 +1254,6 @@ def overlay_heat_map( florencev2_roberta_vqa, florencev2_image_caption, florencev2_object_detection, - florencev2_fine_tuning, detr_segmentation, depth_anything_v2, generate_soft_edge_image,