diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 95560ecd..9090b706 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -28,7 +28,7 @@ class DefaultImports: code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions", + "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning", ] @staticmethod diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py index 1a07f8ae..4c50c388 100644 --- a/vision_agent/clients/landing_public_api.py +++ b/vision_agent/clients/landing_public_api.py @@ -4,7 +4,7 @@ from vision_agent.clients.http import BaseHTTP from vision_agent.utils.type_defs import LandingaiAPIKey -from vision_agent.tools.tool_types import BboxInputBase64, PromptTask +from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask class LandingPublicAPI(BaseHTTP): diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 1ea94510..52681274 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,6 +1,6 @@ from typing import Callable, List, Optional -from .meta_tools import META_TOOL_DOCSTRING +from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT from .tools import ( TOOL_DESCRIPTIONS, @@ -19,7 +19,6 @@ florencev2_image_caption, florencev2_object_detection, florencev2_roberta_vqa, - florencev2_fine_tuning, generate_pose_image, generate_soft_edge_image, get_tool_documentation, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 290133eb..851aab18 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -396,7 +396,7 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: Parameters: bboxes (List[BboxInput]): A list of BboxInput containing the - image object, image filename, labels and bounding boxes. + image path, labels and bounding boxes. task (PromptTask): The florencev2 fine-tuning task. The options are CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. @@ -407,8 +407,8 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: Example ------- >>> fine_tuning_job_id = florencev2_fine_tuning( - [{'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[370, 30, 560, 290]]}, - {'image': image, 'filename': 'filename.png', 'label': ['screw'], 'bbox': [[120, 0, 300, 170]]}], + [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, + {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], "OBJECT_DETECTION" ) """ @@ -416,8 +416,8 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: task_input = PromptTask[task] fine_tuning_request = [ BboxInputBase64( - image=convert_to_b64(bbox_input.image), - filename=bbox_input.filename, + image=convert_to_b64(bbox_input.image_path), + filename=bbox_input.image_path.split("/")[-1], labels=bbox_input.labels, bboxes=bbox_input.bboxes, ) diff --git a/vision_agent/tools/meta_tools_types.py b/vision_agent/tools/meta_tools_types.py index f14cd337..4c60923e 100644 --- a/vision_agent/tools/meta_tools_types.py +++ b/vision_agent/tools/meta_tools_types.py @@ -1,15 +1,11 @@ from enum import Enum -from typing import List, Tuple, Literal +from typing import List, Tuple -from nptyping import UInt8, NDArray -from pydantic import BaseModel, ConfigDict +from pydantic import BaseModel class BboxInput(BaseModel): - model_config = ConfigDict(arbitrary_types_allowed=True) - - image: NDArray[Literal["Height, Width, 3"], UInt8] - filename: str + image_path: str labels: List[str] bboxes: List[Tuple[int, int, int, int]]