fine-tuning to tools

landing-ai · Aug 23, 2024 · a61ac26 · a61ac26
1 parent d6d4b78
commit a61ac26
Show file tree

Hide file tree

Showing 6 changed files with 151 additions and 158 deletions.
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
@@ -28,7 +28,7 @@ class DefaultImports:
  code = [
  "from typing import *",
  "from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning, florencev2_fine_tuned_object_detection, check_if_fine_tuned_florencev2_is_ready",
+ "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
  ]
 
  @staticmethod

diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py
@@ -4,7 +4,7 @@
 
 from vision_agent.clients.http import BaseHTTP
 from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask, JobStatus
+from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
 
 
 class LandingPublicAPI(BaseHTTP):

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
@@ -2,9 +2,6 @@
 
 from .meta_tools import (
  META_TOOL_DOCSTRING,
- florencev2_fine_tuning,
- florencev2_fine_tuned_object_detection,
- check_if_fine_tuned_florencev2_is_ready,
 )
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tools import (

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
@@ -1,25 +1,13 @@
 import os
 import subprocess
-from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
-import numpy as np
-
 import vision_agent as va
 from vision_agent.lmm.types import Message
-from vision_agent.tools.tool_utils import get_tool_documentation, send_inference_request
+from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.tools.meta_tools_types import (
- BboxInput,
- BboxInputBase64,
- PromptTask,
- Florencev2FtRequest,
- FineTuning,
- JobStatus,
-)
+
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -398,142 +386,6 @@ def get_tool_descriptions() -> str:
  return TOOL_DESCRIPTIONS
 
 
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
-
- Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image path, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
-
- Example
- -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
- )
- """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image_path),
- filename=bbox_input.image_path.split("/")[-1],
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
-
-def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool:
- """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether
- is possible to use a certain florencev2 model. It checks if the status
- is SUCCEEDED.
-
- Parameters:
- model_id (UUID): The fine-tuned model id.
-
- Returns:
- bool: The indication if the model is ready to be used or not. If this
- is False, it's recommended to wait 5 seconds before checking again.
-
- Example
- -------
- >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"))
- True
- """
- # check if job succeeded first
- landing_api = LandingPublicAPI()
- status = landing_api.check_fine_tuning_job(model_id)
- return status is JobStatus.SUCCEEDED
-
-
-def florencev2_fine_tuned_object_detection(
- image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool
-) -> List[Dict[str, Any]]:
- """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
- to detect objects given a text prompt such as a phrase or class names separated by
- commas. It returns a list of detected objects as labels and their location as
- bounding boxes with score of 1.0.
-
- Parameters:
- image (np.ndarray): The image to used to detect objects.
- prompt (str): The prompt to help find objects in the image.
- model_id (UUID): The fine-tuned model id.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
- model_is_ready (bool): If the model is ready to be used. It's recommended
- to get this value from the function check_if_fine_tuned_florencev2_is_ready.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box. The scores are always 1.0 and cannot be thresholded
-
- Example
- -------
- >>> florencev2_fine_tuned_object_detection(
- image,
- 'person looking at a coyote',
- UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"),
- model_is_ready=check_if_fine_tuned_florencev2_is_ready(
- UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
- )
- )
- [
- {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
- ]
- """
- if not model_is_ready:
- return []
-
- task = PromptTask[task]
- if task is PromptTask.OBJECT_DETECTION:
- prompt = ""
-
- data_obj = Florencev2FtRequest(
- image=convert_to_b64(image),
- task=task,
- tool="florencev2_fine_tuning",
- prompt=prompt,
- fine_tuning=FineTuning(job_id=model_id),
- )
- data = data_obj.model_dump(by_alias=True)
- metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
- detections = send_inference_request(
- data, "tools", v2=False, metadata_payload=metadata_payload
- )
-
- detections = detections[task.value]
- return_data = []
- image_size = image.shape[:2]
- for i in range(len(detections["bboxes"])):
- return_data.append(
- {
- "score": 1.0,
- "label": detections["labels"][i],
- "bbox": normalize_bbox(detections["bboxes"][i], image_size),
- }
- )
- return return_data
-
-
 META_TOOL_DOCSTRING = get_tool_documentation(
  [
  get_tool_descriptions,
@@ -547,8 +399,5 @@ def florencev2_fine_tuned_object_detection(
  search_dir,
  search_file,
  find_file,
- florencev2_fine_tuning,
- florencev2_fine_tuned_object_detection,
- check_if_fine_tuned_florencev2_is_ready,
  ]
 )
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -2,6 +2,7 @@
 import json
 import logging
 import tempfile
+from uuid import UUID
 from pathlib import Path
 from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -31,6 +32,15 @@
  convert_quad_box_to_bbox,
  rle_decode,
 )
+from vision_agent.tools.tools_types import (
+ BboxInput,
+ BboxInputBase64,
+ PromptTask,
+ Florencev2FtRequest,
+ FineTuning,
+ JobStatus,
+)
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 
 register_heif_opener()
 
@@ -1285,6 +1295,143 @@ def overlay_heat_map(
  return np.array(combined)
 
 
+# TODO: add this function to the imports so that it is picked up by the agent
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+ """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
+ to detect objects in an image based on a given dataset. It returns the fine
+ tuning job id.
+
+ Parameters:
+ bboxes (List[BboxInput]): A list of BboxInput containing the
+ image path, labels and bounding boxes.
+ task (str): The name of the florencev2 fine-tuning task. The options are
+ CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+ Returns:
+ UUID: The fine-tuning job id; this id will be used to retrieve the
+ fine-tuned model.
+
+ Example
+ -------
+ >>> fine_tuning_job_id = florencev2_fine_tuning(
+ [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+ {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+ "OBJECT_DETECTION"
+ )
+ """
+ bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+ task_input = PromptTask[task]
+ fine_tuning_request = [
+ BboxInputBase64(
+ image=convert_to_b64(bbox_input.image_path),
+ filename=bbox_input.image_path.split("/")[-1],
+ labels=bbox_input.labels,
+ bboxes=bbox_input.bboxes,
+ )
+ for bbox_input in bboxes_input
+ ]
+ landing_api = LandingPublicAPI()
+ return landing_api.launch_fine_tuning_job(
+ "florencev2", task_input, fine_tuning_request
+ )
+
+
+def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool:
+ """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether
+ it is possible to use a certain florencev2 model. It checks if the status
+ is SUCCEEDED.
+
+ Parameters:
+ model_id (UUID): The fine-tuned model id.
+
+ Returns:
+ bool: The indication if the model is ready to be used or not. If this
+ is False, it's recommended to wait 5 seconds before checking again.
+
+ Example
+ -------
+ >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"))
+ True
+ """
+ # check if job succeeded first
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(model_id)
+ return status is JobStatus.SUCCEEDED
+
+
+def florencev2_fine_tuned_object_detection(
+ image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool
+) -> List[Dict[str, Any]]:
+ """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
+ to detect objects given a text prompt such as a phrase or class names separated by
+ commas. It returns a list of detected objects as labels and their location as
+ bounding boxes with score of 1.0.
+
+ Parameters:
+ image (np.ndarray): The image in which to detect objects.
+ prompt (str): The prompt to help find objects in the image.
+ model_id (UUID): The fine-tuned model id.
+ task (str): The name of the florencev2 fine-tuning task. The options are
+ CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+ model_is_ready (bool): If the model is ready to be used. It's recommended
+ to get this value from the function check_if_fine_tuned_florencev2_is_ready.
+
+ Returns:
+ List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+ bounding box of the detected objects with normalized coordinates between 0
+ and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+ top-left and xmax and ymax are the coordinates of the bottom-right of the
+ bounding box. The scores are always 1.0 and cannot be thresholded.
+
+ Example
+ -------
+ >>> florencev2_fine_tuned_object_detection(
+ image, 'person looking at a coyote',
+ UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"),
+ "CAPTION_TO_PHRASE_GROUNDING",
+ model_is_ready=check_if_fine_tuned_florencev2_is_ready(
+ UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
+ )
+ )
+ [
+ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+ {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+ ]
+ """
+ if not model_is_ready:
+ return []
+
+ task = PromptTask[task]
+ if task is PromptTask.OBJECT_DETECTION:
+ prompt = ""
+
+ data_obj = Florencev2FtRequest(
+ image=convert_to_b64(image),
+ task=task,
+ tool="florencev2_fine_tuning",
+ prompt=prompt,
+ fine_tuning=FineTuning(job_id=model_id),
+ )
+ data = data_obj.model_dump(by_alias=True)
+ metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
+ detections = send_inference_request(
+ data, "tools", v2=False, metadata_payload=metadata_payload
+ )
+
+ detections = detections[task.value]
+ return_data = []
+ image_size = image.shape[:2]
+ for i in range(len(detections["bboxes"])):
+ return_data.append(
+ {
+ "score": 1.0,
+ "label": detections["labels"][i],
+ "bbox": normalize_bbox(detections["bboxes"][i], image_size),
+ }
+ )
+ return return_data
+
+
 TOOLS = [
  owl_v2,
  grounding_sam,

diff --git a/vision_agent/tools/meta_tools_types.py → vision_agent/tools/tools_types.py b/vision_agent/tools/meta_tools_types.py → vision_agent/tools/tools_types.py