Skip to content

Commit

Permalink
tools to meta tools
Browse files Browse the repository at this point in the history
  • Loading branch information
Dayof committed Aug 7, 2024
1 parent fa2452a commit c9ab90a
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 44 deletions.
45 changes: 45 additions & 0 deletions vision_agent/tools/meta_tools.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import os
import subprocess
from uuid import UUID
from pathlib import Path
from typing import Any, Dict, List, Union

import vision_agent as va
from vision_agent.lmm.types import Message
from vision_agent.tools.tool_utils import get_tool_documentation
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
from vision_agent.utils.image_utils import convert_to_b64
from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask

# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

Expand Down Expand Up @@ -385,6 +389,46 @@ def get_tool_descriptions() -> str:
return TOOL_DESCRIPTIONS


def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
    to detect objects in an image based on a given dataset. It returns the fine
    tuning job id.

    Parameters:
        bboxes (List[Dict[str, Any]]): A list of dictionaries, each validated
            as a BboxInput, containing the image object, image filename,
            labels and bounding boxes.
        task (str): The name of the florencev2 fine-tuning task. The options
            are CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.

    Returns:
        UUID: The fine tuning job id, this id will be used to retrieve the
            fine tuned model.

    Example
    -------
        >>> fine_tuning_job_id = florencev2_fine_tuning(
            [{'image': image, 'filename': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
             {'image': image, 'filename': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "OBJECT_DETECTION"
        )
    """
    # Validate every raw dictionary first, so a malformed entry fails before
    # any image is encoded or any request is sent.
    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
    # The task is looked up by member name (e.g. "OBJECT_DETECTION").
    # NOTE(review): presumably PromptTask is an Enum, in which case an unknown
    # name raises KeyError here -- confirm against its definition.
    task_input = PromptTask[task]
    # Re-pack each validated entry with its image converted by convert_to_b64;
    # filename, labels and bboxes are forwarded unchanged.
    fine_tuning_request = [
        BboxInputBase64(
            image=convert_to_b64(bbox_input.image),
            filename=bbox_input.filename,
            labels=bbox_input.labels,
            bboxes=bbox_input.bboxes,
        )
        for bbox_input in bboxes_input
    ]
    landing_api = LandingPublicAPI()
    return landing_api.launch_fine_tuning_job(
        "florencev2", task_input, fine_tuning_request
    )


META_TOOL_DOCSTRING = get_tool_documentation(
[
get_tool_descriptions,
Expand All @@ -398,5 +442,6 @@ def get_tool_descriptions() -> str:
search_dir,
search_file,
find_file,
florencev2_fine_tuning,
]
)
File renamed without changes.
44 changes: 0 additions & 44 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import json
import logging
import tempfile
from uuid import UUID
from pathlib import Path
from importlib import resources
from typing import Any, Dict, List, Optional, Tuple, Union, cast
Expand All @@ -15,8 +14,6 @@
from PIL import Image, ImageDraw, ImageFont
from pillow_heif import register_heif_opener # type: ignore

from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.tool_types import BboxInput, BboxInputBase64, PromptTask
from vision_agent.tools.tool_utils import (
send_inference_request,
get_tool_descriptions,
Expand Down Expand Up @@ -662,46 +659,6 @@ def florencev2_object_detection(image: np.ndarray) -> List[Dict[str, Any]]:
return return_data


def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
    to detect objects in an image based on a given dataset. It returns the fine
    tuning job id.

    Parameters:
        bboxes (List[Dict[str, Any]]): A list of dictionaries, each validated
            as a BboxInput, containing the image object, image filename,
            labels and bounding boxes.
        task (str): The name of the florencev2 fine-tuning task. The options
            are CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.

    Returns:
        UUID: The fine tuning job id, this id will be used to retrieve the
            fine tuned model.

    Example
    -------
        >>> fine_tuning_job_id = florencev2_fine_tuning(
            [{'image': image, 'filename': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
             {'image': image, 'filename': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "OBJECT_DETECTION"
        )
    """
    # Validate every raw dictionary first, so a malformed entry fails before
    # any image is encoded or any request is sent.
    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
    # The task is looked up by member name (e.g. "OBJECT_DETECTION").
    # NOTE(review): presumably PromptTask is an Enum, in which case an unknown
    # name raises KeyError here -- confirm against its definition.
    task_input = PromptTask[task]
    # Re-pack each validated entry with its image converted by convert_to_b64;
    # filename, labels and bboxes are forwarded unchanged.
    fine_tuning_request = [
        BboxInputBase64(
            image=convert_to_b64(bbox_input.image),
            filename=bbox_input.filename,
            labels=bbox_input.labels,
            bboxes=bbox_input.bboxes,
        )
        for bbox_input in bboxes_input
    ]
    landing_api = LandingPublicAPI()
    return landing_api.launch_fine_tuning_job(
        "florencev2", task_input, fine_tuning_request
    )


def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
"""'detr_segmentation' is a tool that can segment common objects in an
image without any text prompt. It returns a list of detected objects
Expand Down Expand Up @@ -1297,7 +1254,6 @@ def overlay_heat_map(
florencev2_roberta_vqa,
florencev2_image_caption,
florencev2_object_detection,
florencev2_fine_tuning,
detr_segmentation,
depth_anything_v2,
generate_soft_edge_image,
Expand Down

0 comments on commit c9ab90a

Please sign in to comment.