From a61ac2667d69674265036dab7cbbc7dfc37b6124 Mon Sep 17 00:00:00 2001
From: Dayanne Fernandes
Date: Fri, 23 Aug 2024 18:20:20 -0300
Subject: [PATCH] fine-tuning to tools

---
 vision_agent/agent/vision_agent.py          |   2 +-
 vision_agent/clients/landing_public_api.py  |   2 +-
 vision_agent/tools/__init__.py              |   3 -
 vision_agent/tools/meta_tools.py            | 155 +-----------------
 vision_agent/tools/tools.py                 | 147 +++++++++++++++++
 .../{meta_tools_types.py => tools_types.py} |   0
 6 files changed, 151 insertions(+), 158 deletions(-)
 rename vision_agent/tools/{meta_tools_types.py => tools_types.py} (100%)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 996e5eac..cfb482e1 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -28,7 +28,7 @@ class DefaultImports:
     code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning, florencev2_fine_tuned_object_detection, check_if_fine_tuned_florencev2_is_ready",
+        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
     ]
 
     @staticmethod
diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py
index 09f98b44..f9d52389 100644
--- a/vision_agent/clients/landing_public_api.py
+++ b/vision_agent/clients/landing_public_api.py
@@ -4,7 +4,7 @@
 
 from vision_agent.clients.http import BaseHTTP
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask, JobStatus
+from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
 
 
 class LandingPublicAPI(BaseHTTP):
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 4a863994..53b64ffb 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -2,9 +2,6 @@
 
 from .meta_tools import (
     META_TOOL_DOCSTRING,
-    florencev2_fine_tuning,
-    florencev2_fine_tuned_object_detection,
-    check_if_fine_tuned_florencev2_is_ready,
 )
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 925de2d4..7c857550 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,25 +1,13 @@
 import os
 import subprocess
-from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
-import numpy as np
-
 import vision_agent as va
 from vision_agent.lmm.types import Message
-from vision_agent.tools.tool_utils import get_tool_documentation, send_inference_request
+from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.tools.meta_tools_types import (
-    BboxInput,
-    BboxInputBase64,
-    PromptTask,
-    Florencev2FtRequest,
-    FineTuning,
-    JobStatus,
-)
+
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
@@ -398,142 +386,6 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
-    """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
-    to detect objects in an image based on a given dataset. It returns the fine
-    tuning job id.
-
-    Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
-    Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
-
-    Example
-    -------
-    >>> fine_tuning_job_id = florencev2_fine_tuning(
-        [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
-         {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-        "OBJECT_DETECTION"
-    )
-    """
-    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_input = PromptTask[task]
-    fine_tuning_request = [
-        BboxInputBase64(
-            image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
-            labels=bbox_input.labels,
-            bboxes=bbox_input.bboxes,
-        )
-        for bbox_input in bboxes_input
-    ]
-    landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_input, fine_tuning_request
-    )
-
-
-def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool:
-    """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether
-    is possible to use a certain florencev2 model. It checks if the status
-    is SUCCEEDED.
-
-    Parameters:
-        model_id (UUID): The fine-tuned model id.
-
-    Returns:
-        bool: The indication if the model is ready to be used or not. If this
-            is False, it's recommended to wait 5 seconds before checking again.
-
-    Example
-    -------
-    >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"))
-    True
-    """
-    # check if job succeeded first
-    landing_api = LandingPublicAPI()
-    status = landing_api.check_fine_tuning_job(model_id)
-    return status is JobStatus.SUCCEEDED
-
-
-def florencev2_fine_tuned_object_detection(
-    image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool
-) -> List[Dict[str, Any]]:
-    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
-    to detect objects given a text prompt such as a phrase or class names separated by
-    commas. It returns a list of detected objects as labels and their location as
-    bounding boxes with score of 1.0.
-
-    Parameters:
-        image (np.ndarray): The image to used to detect objects.
-        prompt (str): The prompt to help find objects in the image.
-        model_id (UUID): The fine-tuned model id.
-        task (PromptTask): The florencev2 fine-tuning task. The options are
-            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-        model_is_ready (bool): If the model is ready to be used. It's recommended
-            to get this value from the function check_if_fine_tuned_florencev2_is_ready.
-
-    Returns:
-        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
-            bounding box of the detected objects with normalized coordinates between 0
-            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
-            top-left and xmax and ymax are the coordinates of the bottom-right of the
-            bounding box. The scores are always 1.0 and cannot be thresholded
-
-    Example
-    -------
-    >>> florencev2_fine_tuned_object_detection(
-        image,
-        'person looking at a coyote',
-        UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"),
-        model_is_ready=check_if_fine_tuned_florencev2_is_ready(
-            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
-        )
-    )
-    [
-        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
-        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
-    ]
-    """
-    if not model_is_ready:
-        return []
-
-    task = PromptTask[task]
-    if task is PromptTask.OBJECT_DETECTION:
-        prompt = ""
-
-    data_obj = Florencev2FtRequest(
-        image=convert_to_b64(image),
-        task=task,
-        tool="florencev2_fine_tuning",
-        prompt=prompt,
-        fine_tuning=FineTuning(job_id=model_id),
-    )
-    data = data_obj.model_dump(by_alias=True)
-    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
-    detections = send_inference_request(
-        data, "tools", v2=False, metadata_payload=metadata_payload
-    )
-
-    detections = detections[task.value]
-    return_data = []
-    image_size = image.shape[:2]
-    for i in range(len(detections["bboxes"])):
-        return_data.append(
-            {
-                "score": 1.0,
-                "label": detections["labels"][i],
-                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
-            }
-        )
-    return return_data
-
-
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -547,8 +399,5 @@ def florencev2_fine_tuned_object_detection(
         search_dir,
         search_file,
         find_file,
-        florencev2_fine_tuning,
-        florencev2_fine_tuned_object_detection,
-        check_if_fine_tuned_florencev2_is_ready,
     ]
 )
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 0254a455..52f3c6d9 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import tempfile
+from uuid import UUID
 from pathlib import Path
 from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -31,6 +32,15 @@
     convert_quad_box_to_bbox,
     rle_decode,
 )
+from vision_agent.tools.tools_types import (
+    BboxInput,
+    BboxInputBase64,
+    PromptTask,
+    Florencev2FtRequest,
+    FineTuning,
+    JobStatus,
+)
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 
 register_heif_opener()
 
@@ -1285,6 +1295,143 @@ def overlay_heat_map(
     return np.array(combined)
 
 
+# TODO: add this function to the imports so that it is picked up by the agent
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
+    to detect objects in an image based on a given dataset. It returns the fine
+    tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine tuning job id, this id will be used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+    >>> fine_tuning_job_id = florencev2_fine_tuning(
+        [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+         {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+        "OBJECT_DETECTION"
+    )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
+def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool:
+    """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether
+    it is possible to use a certain florencev2 model. It checks if the status
+    is SUCCEEDED.
+
+    Parameters:
+        model_id (UUID): The fine-tuned model id.
+
+    Returns:
+        bool: The indication if the model is ready to be used or not. If this
+            is False, it's recommended to wait 5 seconds before checking again.
+
+    Example
+    -------
+    >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"))
+    True
+    """
+    # check if job succeeded first
+    landing_api = LandingPublicAPI()
+    status = landing_api.check_fine_tuning_job(model_id)
+    return status is JobStatus.SUCCEEDED
+
+
+def florencev2_fine_tuned_object_detection(
+    image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool
+) -> List[Dict[str, Any]]:
+    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
+    to detect objects given a text prompt such as a phrase or class names separated by
+    commas. It returns a list of detected objects as labels and their location as
+    bounding boxes with score of 1.0.
+
+    Parameters:
+        image (np.ndarray): The image used to detect objects.
+        prompt (str): The prompt to help find objects in the image.
+        model_id (UUID): The fine-tuned model id.
+        task (PromptTask): The florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+        model_is_ready (bool): If the model is ready to be used. It's recommended
+            to get this value from the function check_if_fine_tuned_florencev2_is_ready.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Example
+    -------
+    >>> florencev2_fine_tuned_object_detection(
+        image,
+        'person looking at a coyote',
+        UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"), "CAPTION_TO_PHRASE_GROUNDING",
+        model_is_ready=check_if_fine_tuned_florencev2_is_ready(
+            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+        )
+    )
+    [
+        {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+        {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+    ]
+    """
+    if not model_is_ready:
+        return []
+
+    task = PromptTask[task]
+    if task is PromptTask.OBJECT_DETECTION:
+        prompt = ""
+
+    data_obj = Florencev2FtRequest(
+        image=convert_to_b64(image),
+        task=task,
+        tool="florencev2_fine_tuning",
+        prompt=prompt,
+        fine_tuning=FineTuning(job_id=model_id),
+    )
+    data = data_obj.model_dump(by_alias=True)
+    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
+    detections = send_inference_request(
+        data, "tools", v2=False, metadata_payload=metadata_payload
+    )
+
+    detections = detections[task.value]
+    return_data = []
+    image_size = image.shape[:2]
+    for i in range(len(detections["bboxes"])):
+        return_data.append(
+            {
+                "score": 1.0,
+                "label": detections["labels"][i],
+                "bbox": normalize_bbox(detections["bboxes"][i], image_size),
+            }
+        )
+    return return_data
+
+
 TOOLS = [
     owl_v2,
     grounding_sam,
diff --git a/vision_agent/tools/meta_tools_types.py b/vision_agent/tools/tools_types.py
similarity index 100%
rename from vision_agent/tools/meta_tools_types.py
rename to vision_agent/tools/tools_types.py
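
For reference, below is a minimal usage sketch of how the three relocated tools fit together once this patch is applied. It imports straight from vision_agent.tools.tools, since the patch drops the re-exports from vision_agent.tools.__init__ and the TODO above notes they have not yet been re-added to the package imports. The image path, labels, bounding boxes, and the 5-second polling interval are illustrative placeholders, not values taken from the patch.

import time

import numpy as np
from PIL import Image

from vision_agent.tools.tools import (
    florencev2_fine_tuning,
    check_if_fine_tuned_florencev2_is_ready,
    florencev2_fine_tuned_object_detection,
)

# 1. Launch a fine-tuning job from a small annotated dataset (placeholder data).
job_id = florencev2_fine_tuning(
    [
        {
            "image_path": "screws.png",  # hypothetical image path
            "labels": ["screw"],
            "bboxes": [[370, 30, 560, 290]],
        }
    ],
    "OBJECT_DETECTION",
)

# 2. Poll until the job reaches SUCCEEDED; the docstring suggests ~5 s between checks.
while not check_if_fine_tuned_florencev2_is_ready(job_id):
    time.sleep(5)

# 3. Run detection with the fine-tuned model; scores are always 1.0.
image = np.array(Image.open("screws.png"))
detections = florencev2_fine_tuned_object_detection(
    image, "screw", job_id, "OBJECT_DETECTION", model_is_ready=True
)
print(detections)  # e.g. [{'score': 1.0, 'label': 'screw', 'bbox': [...]}, ...]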