tools to meta tools

landing-ai · Aug 7, 2024 · c9ab90a · c9ab90a
1 parent fa2452a
commit c9ab90a
Show file tree

Hide file tree

Showing 3 changed files with 45 additions and 44 deletions.
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
@@ -1,12 +1,16 @@
 import os
 import subprocess
+from uuid import UUID
 from pathlib import Path
 from typing import Any, Dict, List, Union
 
 import vision_agent as va
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.clients.landing_public_api import LandingPublicAPI
+from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
  return TOOL_DESCRIPTIONS
 
 
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+ """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
+ to detect objects in an image based on a given dataset. It returns the
+ fine-tuning job id.
+
+ Parameters:
+ bboxes (List[BboxInput]): A list of BboxInput containing the
+ image object, image filename, labels and bounding boxes.
+ task (PromptTask): The florencev2 fine-tuning task. The options are
+ CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+ Returns:
+ UUID: The fine-tuning job id; this id will be used to retrieve the
+ fine-tuned model.
+
+ Example
+ -------
+ >>> fine_tuning_job_id = florencev2_fine_tuning(
+ [{'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[370, 30, 560, 290]]},
+ {'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[120, 0, 300, 170]]}],
+ "OBJECT_DETECTION"
+ )
+ """
+ bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+ task_input = PromptTask[task]
+ fine_tuning_request = [
+ BboxInputBase64(
+ image=convert_to_b64(bbox_input.image),
+ filename=bbox_input.filename,
+ labels=bbox_input.labels,
+ bboxes=bbox_input.bboxes,
+ )
+ for bbox_input in bboxes_input
+ ]
+ landing_api = LandingPublicAPI()
+ return landing_api.launch_fine_tuning_job(
+ "florencev2", task_input, fine_tuning_request
+ )
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
  [
  get_tool_descriptions,
@@ -398,5 +442,6 @@ def get_tool_descriptions() -> str:
  search_dir,
  search_file,
  find_file,
+ florencev2_fine_tuning,
  ]
 )
diff --git a/vision_agent/tools/tool_types.py → vision_agent/tools/meta_tools_types.py b/vision_agent/tools/tool_types.py → vision_agent/tools/meta_tools_types.py
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -2,7 +2,6 @@
 import json
 import logging
 import tempfile
-from uuid import UUID
 from pathlib import Path
 from importlib import resources
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -15,8 +14,6 @@
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener # type: ignore
 
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.tools.tool_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.tools.tool_utils import (
  send_inference_request,
  get_tool_descriptions,
@@ -662,46 +659,6 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
  return return_data
 
 
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
-
- Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image object, image filename, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
-
- Example
- -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[370, 30, 560, 290]]},
- {'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
- )
- """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image),
- filename=bbox_input.filename,
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
-
 def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
  """'detr_segmentation' is a tool that can segment common objects in an
  image without any text prompt. It returns a list of detected objects
@@ -1297,7 +1254,6 @@ def overlay_heat_map(
  florencev2_roberta_vqa,
  florencev2_image_caption,
  florencev2_object_detection,
- florencev2_fine_tuning,
  detr_segmentation,
  depth_anything_v2,
  generate_soft_edge_image,