Skip to content

Commit

Permalink
fine-tuning to tools
Browse files Browse the repository at this point in the history
  • Loading branch information
Dayof committed Aug 23, 2024
1 parent d6d4b78 commit a61ac26
Show file tree
Hide file tree
Showing 6 changed files with 151 additions and 158 deletions.
2 changes: 1 addition & 1 deletion vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class DefaultImports:
code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning, florencev2_fine_tuned_object_detection, check_if_fine_tuned_florencev2_is_ready",
"from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
]

@staticmethod
Expand Down
2 changes: 1 addition & 1 deletion vision_agent/clients/landing_public_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from vision_agent.clients.http import BaseHTTP
from vision_agent.utils.type_defs import LandingaiAPIKey
from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask, JobStatus
from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus


class LandingPublicAPI(BaseHTTP):
Expand Down
3 changes: 0 additions & 3 deletions vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,6 @@

from .meta_tools import (
META_TOOL_DOCSTRING,
florencev2_fine_tuning,
florencev2_fine_tuned_object_detection,
check_if_fine_tuned_florencev2_is_ready,
)
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
from .tools import (
Expand Down
155 changes: 2 additions & 153 deletions vision_agent/tools/meta_tools.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,13 @@
import os
import subprocess
from uuid import UUID
from pathlib import Path
from typing import Any, Dict, List, Union

import numpy as np

import vision_agent as va
from vision_agent.lmm.types import Message
from vision_agent.tools.tool_utils import get_tool_documentation, send_inference_request
from vision_agent.tools.tool_utils import get_tool_documentation
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox
from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.tools.meta_tools_types import (
BboxInput,
BboxInputBase64,
PromptTask,
Florencev2FtRequest,
FineTuning,
JobStatus,
)


# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent

Expand Down Expand Up @@ -398,142 +386,6 @@ def get_tool_descriptions() -> str:
return TOOL_DESCRIPTIONS


def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
    to detect objects in an image based on a given dataset. It returns the fine
    tuning job id.

    Parameters:
        bboxes (List[Dict[str, Any]]): A list of dictionaries, each validated as
            a BboxInput, containing the image path, labels and bounding boxes.
        task (str): The name of the florencev2 fine-tuning task, looked up in
            PromptTask. The options are CAPTION, CAPTION_TO_PHRASE_GROUNDING
            and OBJECT_DETECTION.

    Returns:
        UUID: The fine tuning job id, this id will be used to retrieve the fine
            tuned model.

    Example
    -------
        >>> fine_tuning_job_id = florencev2_fine_tuning(
            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "OBJECT_DETECTION"
        )
    """
    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
    # Lookup is by member name; an unknown name raises here, before any upload.
    task_input = PromptTask[task]
    fine_tuning_request = [
        BboxInputBase64(
            image=convert_to_b64(bbox_input.image_path),
            # Path(...).name follows the platform's path separators, whereas
            # split("/") returned the whole path for backslash-separated paths.
            filename=Path(bbox_input.image_path).name,
            labels=bbox_input.labels,
            bboxes=bbox_input.bboxes,
        )
        for bbox_input in bboxes_input
    ]
    landing_api = LandingPublicAPI()
    return landing_api.launch_fine_tuning_job(
        "florencev2", task_input, fine_tuning_request
    )


def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool:
    """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether
    it is possible to use a certain fine-tuned florencev2 model. It checks if
    the status of the fine-tuning job is SUCCEEDED.

    Parameters:
        model_id (UUID): The fine-tuned model id.

    Returns:
        bool: True if the model is ready to be used, False otherwise. If this
            is False, it's recommended to wait 5 seconds before checking again.

    Example
    -------
        >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"))
        True
    """
    # The model is usable only once its fine-tuning job reports SUCCEEDED.
    job_status = LandingPublicAPI().check_fine_tuning_job(model_id)
    return job_status is JobStatus.SUCCEEDED


def florencev2_fine_tuned_object_detection(
    image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool
) -> List[Dict[str, Any]]:
    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
    to detect objects given a text prompt such as a phrase or class names separated by
    commas. It returns a list of detected objects as labels and their location as
    bounding boxes with score of 1.0.

    Parameters:
        image (np.ndarray): The image used to detect objects.
        prompt (str): The prompt to help find objects in the image. It is
            replaced by an empty string when the task is OBJECT_DETECTION.
        model_id (UUID): The fine-tuned model id.
        task (str): The name of the florencev2 fine-tuning task, looked up in
            PromptTask. The options are CAPTION, CAPTION_TO_PHRASE_GROUNDING
            and OBJECT_DETECTION.
        model_is_ready (bool): If the model is ready to be used. It's recommended
            to get this value from the function check_if_fine_tuned_florencev2_is_ready.
            When False, an empty list is returned and no request is sent.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
            bounding box of the detected objects with normalized coordinates between 0
            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of the
            bounding box. The scores are always 1.0 and cannot be thresholded.

    Example
    -------
        >>> florencev2_fine_tuned_object_detection(
            image,
            'person looking at a coyote',
            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"),
            "CAPTION_TO_PHRASE_GROUNDING",
            model_is_ready=check_if_fine_tuned_florencev2_is_ready(
                UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
            )
        )
        [
            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
        ]
    """
    if not model_is_ready:
        return []

    # Keep the enum member in its own name rather than rebinding the `str`
    # parameter, so `task` keeps the type its annotation promises.
    prompt_task = PromptTask[task]
    if prompt_task is PromptTask.OBJECT_DETECTION:
        # The text prompt is blanked for the plain object-detection task.
        prompt = ""

    data_obj = Florencev2FtRequest(
        image=convert_to_b64(image),
        task=prompt_task,
        tool="florencev2_fine_tuning",
        prompt=prompt,
        fine_tuning=FineTuning(job_id=model_id),
    )
    data = data_obj.model_dump(by_alias=True)
    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
    response = send_inference_request(
        data, "tools", v2=False, metadata_payload=metadata_payload
    )

    # The response is keyed by the task's value.
    detections = response[prompt_task.value]
    image_size = image.shape[:2]
    return [
        {
            "score": 1.0,
            "label": detections["labels"][i],
            "bbox": normalize_bbox(bbox, image_size),
        }
        for i, bbox in enumerate(detections["bboxes"])
    ]


META_TOOL_DOCSTRING = get_tool_documentation(
[
get_tool_descriptions,
Expand All @@ -547,8 +399,5 @@ def florencev2_fine_tuned_object_detection(
search_dir,
search_file,
find_file,
florencev2_fine_tuning,
florencev2_fine_tuned_object_detection,
check_if_fine_tuned_florencev2_is_ready,
]
)
147 changes: 147 additions & 0 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import logging
import tempfile
from uuid import UUID
from pathlib import Path
from importlib import resources
from typing import Any, Dict, List, Optional, Tuple, Union, cast
Expand Down Expand Up @@ -31,6 +32,15 @@
convert_quad_box_to_bbox,
rle_decode,
)
from vision_agent.tools.tools_types import (
BboxInput,
BboxInputBase64,
PromptTask,
Florencev2FtRequest,
FineTuning,
JobStatus,
)
from vision_agent.clients.landing_public_api import LandingPublicAPI

register_heif_opener()

Expand Down Expand Up @@ -1285,6 +1295,143 @@ def overlay_heat_map(
return np.array(combined)


# TODO: add this function to the imports so that it is picked up by the agent
def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to be able
    to detect objects in an image based on a given dataset. It returns the fine
    tuning job id.

    Parameters:
        bboxes (List[Dict[str, Any]]): A list of dictionaries, each validated as
            a BboxInput, containing the image path, labels and bounding boxes.
        task (str): The name of the florencev2 fine-tuning task, looked up in
            PromptTask. The options are CAPTION, CAPTION_TO_PHRASE_GROUNDING
            and OBJECT_DETECTION.

    Returns:
        UUID: The fine tuning job id, this id will be used to retrieve the fine
            tuned model.

    Example
    -------
        >>> fine_tuning_job_id = florencev2_fine_tuning(
            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
             "OBJECT_DETECTION"
        )
    """
    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
    # Lookup is by member name; an unknown name raises here, before any upload.
    task_input = PromptTask[task]
    fine_tuning_request = [
        BboxInputBase64(
            image=convert_to_b64(bbox_input.image_path),
            # Path(...).name follows the platform's path separators, whereas
            # split("/") returned the whole path for backslash-separated paths.
            filename=Path(bbox_input.image_path).name,
            labels=bbox_input.labels,
            bboxes=bbox_input.bboxes,
        )
        for bbox_input in bboxes_input
    ]
    landing_api = LandingPublicAPI()
    return landing_api.launch_fine_tuning_job(
        "florencev2", task_input, fine_tuning_request
    )


def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool:
    """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether
    it is possible to use a certain fine-tuned florencev2 model. It checks if
    the status of the fine-tuning job is SUCCEEDED.

    Parameters:
        model_id (UUID): The fine-tuned model id.

    Returns:
        bool: True if the model is ready to be used, False otherwise. If this
            is False, it's recommended to wait 5 seconds before checking again.

    Example
    -------
        >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"))
        True
    """
    # The model is usable only once its fine-tuning job reports SUCCEEDED.
    job_status = LandingPublicAPI().check_fine_tuning_job(model_id)
    return job_status is JobStatus.SUCCEEDED


def florencev2_fine_tuned_object_detection(
    image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool
) -> List[Dict[str, Any]]:
    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
    to detect objects given a text prompt such as a phrase or class names separated by
    commas. It returns a list of detected objects as labels and their location as
    bounding boxes with score of 1.0.

    Parameters:
        image (np.ndarray): The image used to detect objects.
        prompt (str): The prompt to help find objects in the image. It is
            replaced by an empty string when the task is OBJECT_DETECTION.
        model_id (UUID): The fine-tuned model id.
        task (str): The name of the florencev2 fine-tuning task, looked up in
            PromptTask. The options are CAPTION, CAPTION_TO_PHRASE_GROUNDING
            and OBJECT_DETECTION.
        model_is_ready (bool): If the model is ready to be used. It's recommended
            to get this value from the function check_if_fine_tuned_florencev2_is_ready.
            When False, an empty list is returned and no request is sent.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
            bounding box of the detected objects with normalized coordinates between 0
            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
            top-left and xmax and ymax are the coordinates of the bottom-right of the
            bounding box. The scores are always 1.0 and cannot be thresholded.

    Example
    -------
        >>> florencev2_fine_tuned_object_detection(
            image,
            'person looking at a coyote',
            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"),
            "CAPTION_TO_PHRASE_GROUNDING",
            model_is_ready=check_if_fine_tuned_florencev2_is_ready(
                UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
            )
        )
        [
            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
        ]
    """
    if not model_is_ready:
        return []

    # Keep the enum member in its own name rather than rebinding the `str`
    # parameter, so `task` keeps the type its annotation promises.
    prompt_task = PromptTask[task]
    if prompt_task is PromptTask.OBJECT_DETECTION:
        # The text prompt is blanked for the plain object-detection task.
        prompt = ""

    data_obj = Florencev2FtRequest(
        image=convert_to_b64(image),
        task=prompt_task,
        tool="florencev2_fine_tuning",
        prompt=prompt,
        fine_tuning=FineTuning(job_id=model_id),
    )
    data = data_obj.model_dump(by_alias=True)
    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
    response = send_inference_request(
        data, "tools", v2=False, metadata_payload=metadata_payload
    )

    # The response is keyed by the task's value.
    detections = response[prompt_task.value]
    image_size = image.shape[:2]
    return [
        {
            "score": 1.0,
            "label": detections["labels"][i],
            "bbox": normalize_bbox(bbox, image_size),
        }
        for i, bbox in enumerate(detections["bboxes"])
    ]


TOOLS = [
owl_v2,
grounding_sam,
Expand Down
File renamed without changes.

0 comments on commit a61ac26

Please sign in to comment.