From fd87fa51cb001001817c2668db02b51f156f4a64 Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Tue, 13 Aug 2024 18:27:55 -0300 Subject: [PATCH 1/7] check status and run prediction with fine tuned model --- vision_agent/agent/vision_agent.py | 2 +- vision_agent/clients/http.py | 15 +++ vision_agent/clients/landing_public_api.py | 6 +- vision_agent/tools/__init__.py | 7 +- vision_agent/tools/meta_tools.py | 113 ++++++++++++++++++++- vision_agent/tools/meta_tools_types.py | 58 ++++++++++- vision_agent/tools/tool_utils.py | 24 +++-- 7 files changed, 209 insertions(+), 16 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 9090b706..375202db 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -28,7 +28,7 @@ class DefaultImports: code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning", + "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning, florencev2_fine_tuned_object_detection, check_if_fine_tuned_florencev2_is_ready", ] @staticmethod diff --git a/vision_agent/clients/http.py b/vision_agent/clients/http.py index 678148a9..dc969595 100644 --- a/vision_agent/clients/http.py +++ b/vision_agent/clients/http.py @@ -44,3 +44,18 @@ def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]: resp_text = response.text _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.") return result + + def get(self, url: str) -> Dict[str, Any]: + formatted_url = f"{self._base_endpoint}/{url}" + _LOGGER.info(f"Sending data to {formatted_url}") + try: + response = self._session.get(url=formatted_url, timeout=self._TIMEOUT) + response.raise_for_status() + result: Dict[str, Any] = response.json() + _LOGGER.info(json.dumps(result)) + except (ConnectionError, Timeout, RequestException) as err: + _LOGGER.warning(f"Error: {err}.") + except json.JSONDecodeError: + resp_text = response.text + _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.") + return result diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py index 4c50c388..09f98b44 100644 --- a/vision_agent/clients/landing_public_api.py +++ b/vision_agent/clients/landing_public_api.py @@ -4,7 +4,7 @@ from vision_agent.clients.http import BaseHTTP from vision_agent.utils.type_defs import LandingaiAPIKey -from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask +from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask, JobStatus class LandingPublicAPI(BaseHTTP): @@ -24,3 +24,7 @@ def launch_fine_tuning_job( } response = self.post(url, payload=data) return UUID(response["jobId"]) + + def check_fine_tuning_job(self, job_id: UUID) -> JobStatus: + url = f"v1/agent/jobs/fine-tuning/{job_id}/status" + return JobStatus(self.get(url)["status"]) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index f9879626..4a863994 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,6 +1,11 @@ from typing import Callable, List, Optional -from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning +from .meta_tools import ( + META_TOOL_DOCSTRING, + florencev2_fine_tuning, + florencev2_fine_tuned_object_detection, + check_if_fine_tuned_florencev2_is_ready, +) from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT from .tools import ( TOOL_DESCRIPTIONS, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 851aab18..2ff6df3c 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -4,13 +4,22 @@ from pathlib import Path from typing import Any, Dict, List, Union +import numpy as np + import vision_agent as va from vision_agent.lmm.types import Message -from vision_agent.tools.tool_utils import get_tool_documentation +from vision_agent.tools.tool_utils import get_tool_documentation, send_inference_request from vision_agent.tools.tools import TOOL_DESCRIPTIONS -from vision_agent.utils.image_utils import convert_to_b64 +from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox from vision_agent.clients.landing_public_api import LandingPublicAPI -from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask +from vision_agent.tools.meta_tools_types import ( + BboxInput, + BboxInputBase64, + PromptTask, + Florencev2FtRequest, + FineTuning, + JobStatus, +) # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -384,7 +393,7 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str: def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. - Helpful for answerings questions about what types of vision tasks you can do with + Helpful for answering questions about what types of vision tasks you can do with `generate_vision_code`.""" return TOOL_DESCRIPTIONS @@ -429,6 +438,100 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: ) +def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool: + """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether + is possible to use a certain florencev2 model. It checks if the status + is SUCCEEDED. + + Parameters: + model_id (UUID): The fine-tuned model id. + + Returns: + bool: The indication if the model is ready to be used or not. If this + is False, it's recommended to wait 5 seconds before checking again. + + Example + ------- + >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")) + True + """ + # check if job succeeded first + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(model_id) + return status is JobStatus.SUCCEEDED + + +def florencev2_fine_tuned_object_detection( + image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool +) -> List[Dict[str, Any]]: + """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model + to detect objects given a text prompt such as a phrase or class names separated by + commas. It returns a list of detected objects as labels and their location as + bounding boxes with score of 1.0. + + Parameters: + image (np.ndarray): The image to used to detect objects. + prompt (str): The prompt to help find objects in the image. + model_id (UUID): The fine-tuned model id. + task (PromptTask): The florencev2 fine-tuning task. The options are + CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. + model_is_ready (bool): If the model is ready to be used. It's recommended + to get this value from the function check_if_fine_tuned_florencev2_is_ready. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the score, label, and + bounding box of the detected objects with normalized coordinates between 0 + and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the + top-left and xmax and ymax are the coordinates of the bottom-right of the + bounding box. The scores are always 1.0 and cannot be thresholded + + Example + ------- + >>> florencev2_fine_tuned_object_detection( + image, + 'person looking at a coyote', + UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"), + model_is_ready=True + ) + [ + {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, + {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, + ] + """ + if not model_is_ready: + return [] + + task = PromptTask[task] + if task is PromptTask.OBJECT_DETECTION: + prompt = "" + + data_obj = Florencev2FtRequest( + image=convert_to_b64(image), + task=task, + tool="florencev2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning(job_id=model_id), + ) + data = data_obj.model_dump(by_alias=True) + metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} + detections = send_inference_request( + data, "tools", v2=False, metadata_payload=metadata_payload + ) + + detections = detections[task.value] + return_data = [] + image_size = image.shape[:2] + for i in range(len(detections["bboxes"])): + return_data.append( + { + "score": 1.0, + "label": detections["labels"][i], + "bbox": normalize_bbox(detections["bboxes"][i], image_size), + } + ) + return return_data + + META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -443,5 +546,7 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: search_file, find_file, florencev2_fine_tuning, + florencev2_fine_tuned_object_detection, + check_if_fine_tuned_florencev2_is_ready, ] ) diff --git a/vision_agent/tools/meta_tools_types.py b/vision_agent/tools/meta_tools_types.py index 4c60923e..6b34750b 100644 --- a/vision_agent/tools/meta_tools_types.py +++ b/vision_agent/tools/meta_tools_types.py @@ -1,7 +1,8 @@ +from uuid import UUID from enum import Enum -from typing import List, Tuple +from typing import List, Tuple, Optional -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict, Field, field_serializer class BboxInput(BaseModel): @@ -28,3 +29,56 @@ class PromptTask(str, Enum): """""" OBJECT_DETECTION = "" """""" + + +class FineTuning(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + job_id: UUID = Field(alias="jobId") + + @field_serializer("job_id") + def serialize_job_id(self, job_id: UUID, _info): + return str(job_id) + + +class Florencev2FtRequest(BaseModel): + model_config = ConfigDict(populate_by_name=True) + + image: str + task: PromptTask + tool: str + prompt: Optional[str] = "" + fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning") + + +class JobStatus(str, Enum): + """The status of a fine-tuning job. + + CREATED: + The job has been created and is waiting to be scheduled to run. + STARTING: + The job has started running, but not entering the training phase. + TRAINING: + The job is training a model. + EVALUATING: + The job is evaluating the model and computing metrics. + PUBLISHING: + The job is exporting the artifact(s) to an external directory (s3 or local). + SUCCEEDED: + The job has finished, including training, evaluation and publishing the + artifact(s). + FAILED: + The job has failed for some reason internally, it can be due to resources + issues or the code itself. + STOPPED: + The job has been stopped by the use locally or in the cloud. + """ + + CREATED = "CREATED" + STARTING = "STARTING" + TRAINING = "TRAINING" + EVALUATING = "EVALUATING" + PUBLISHING = "PUBLISHING" + SUCCEEDED = "SUCCEEDED" + FAILED = "FAILED" + STOPPED = "STOPPED" diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py index 0ff56177..e6dfc67d 100644 --- a/vision_agent/tools/tool_utils.py +++ b/vision_agent/tools/tool_utils.py @@ -15,9 +15,10 @@ from vision_agent.utils.type_defs import LandingaiAPIKey _LOGGER = logging.getLogger(__name__) -_LND_API_KEY = LandingaiAPIKey().api_key -_LND_API_URL = "https://api.landing.ai/v1/agent/model" -_LND_API_URL_v2 = "https://api.landing.ai/v1/tools" +_LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key) +_LND_BASE_URL = os.environ.get("LANDINGAI_URL", "https://api.landing.ai") +_LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model" +_LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools" class ToolCallTrace(BaseModel): @@ -28,8 +29,13 @@ class ToolCallTrace(BaseModel): def send_inference_request( - payload: Dict[str, Any], endpoint_name: str, v2: bool = False + payload: Dict[str, Any], + endpoint_name: str, + v2: bool = False, + metadata_payload: Optional[Dict[str, Any]] = None, ) -> Dict[str, Any]: + # TODO: runtime_tag and function_name should be metadata_payload and now included + # in the service payload try: if runtime_tag := os.environ.get("RUNTIME_TAG", ""): payload["runtime_tag"] = runtime_tag @@ -62,9 +68,13 @@ def send_inference_request( traceback_raw=[], ) _LOGGER.error(f"Request failed: {res.status_code} {res.text}") - raise RemoteToolCallFailed( - payload["function_name"], res.status_code, res.text - ) + # TODO: function_name should be in metadata_payload + function_name = "unknown" + if "function_name" in payload: + function_name = payload["function_name"] + elif metadata_payload is not None and "function_name" in metadata_payload: + function_name = metadata_payload["function_name"] + raise RemoteToolCallFailed(function_name, res.status_code, res.text) resp = res.json() tool_call_trace.response = resp From 4bee8d76a84981cbb247567dd5dba78bc6ae7325 Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Tue, 13 Aug 2024 19:04:07 -0300 Subject: [PATCH 2/7] fix linter --- pyproject.toml | 2 ++ vision_agent/tools/meta_tools_types.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 664534e2..9f971210 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,8 @@ line_length = 88 profile = "black" [tool.mypy] +plugins = "pydantic.mypy" + exclude = "tests" show_error_context = true pretty = true diff --git a/vision_agent/tools/meta_tools_types.py b/vision_agent/tools/meta_tools_types.py index 6b34750b..aeb45c95 100644 --- a/vision_agent/tools/meta_tools_types.py +++ b/vision_agent/tools/meta_tools_types.py @@ -2,7 +2,7 @@ from enum import Enum from typing import List, Tuple, Optional -from pydantic import BaseModel, ConfigDict, Field, field_serializer +from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo class BboxInput(BaseModel): @@ -37,7 +37,7 @@ class FineTuning(BaseModel): job_id: UUID = Field(alias="jobId") @field_serializer("job_id") - def serialize_job_id(self, job_id: UUID, _info): + def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: return str(job_id) From 21d5ee82a01c15250309ef47c1a0098581177e1c Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Tue, 13 Aug 2024 19:09:58 -0300 Subject: [PATCH 3/7] improve docstring --- vision_agent/tools/meta_tools.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 2ff6df3c..925de2d4 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -491,7 +491,9 @@ def florencev2_fine_tuned_object_detection( image, 'person looking at a coyote', UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"), - model_is_ready=True + model_is_ready=check_if_fine_tuned_florencev2_is_ready( + UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") + ) ) [ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, From a61ac2667d69674265036dab7cbbc7dfc37b6124 Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Fri, 23 Aug 2024 18:20:20 -0300 Subject: [PATCH 4/7] fine-tuning to tools --- vision_agent/agent/vision_agent.py | 2 +- vision_agent/clients/landing_public_api.py | 2 +- vision_agent/tools/__init__.py | 3 - vision_agent/tools/meta_tools.py | 155 +----------------- vision_agent/tools/tools.py | 147 +++++++++++++++++ .../{meta_tools_types.py => tools_types.py} | 0 6 files changed, 151 insertions(+), 158 deletions(-) rename vision_agent/tools/{meta_tools_types.py => tools_types.py} (100%) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 996e5eac..cfb482e1 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -28,7 +28,7 @@ class DefaultImports: code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning, florencev2_fine_tuned_object_detection, check_if_fine_tuned_florencev2_is_ready", + "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions", ] @staticmethod diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py index 09f98b44..f9d52389 100644 --- a/vision_agent/clients/landing_public_api.py +++ b/vision_agent/clients/landing_public_api.py @@ -4,7 +4,7 @@ from vision_agent.clients.http import BaseHTTP from vision_agent.utils.type_defs import LandingaiAPIKey -from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask, JobStatus +from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus class LandingPublicAPI(BaseHTTP): diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 4a863994..53b64ffb 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -2,9 +2,6 @@ from .meta_tools import ( META_TOOL_DOCSTRING, - florencev2_fine_tuning, - florencev2_fine_tuned_object_detection, - check_if_fine_tuned_florencev2_is_ready, ) from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT from .tools import ( diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 925de2d4..7c857550 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,25 +1,13 @@ import os import subprocess -from uuid import UUID from pathlib import Path from typing import Any, Dict, List, Union -import numpy as np - import vision_agent as va from vision_agent.lmm.types import Message -from vision_agent.tools.tool_utils import get_tool_documentation, send_inference_request +from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS -from vision_agent.utils.image_utils import convert_to_b64, normalize_bbox -from vision_agent.clients.landing_public_api import LandingPublicAPI -from vision_agent.tools.meta_tools_types import ( - BboxInput, - BboxInputBase64, - PromptTask, - Florencev2FtRequest, - FineTuning, - JobStatus, -) + # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -398,142 +386,6 @@ def get_tool_descriptions() -> str: return TOOL_DESCRIPTIONS -def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: - """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able - to detect objects in an image based on a given dataset. It returns the fine - tuning job id. - - Parameters: - bboxes (List[BboxInput]): A list of BboxInput containing the - image path, labels and bounding boxes. - task (PromptTask): The florencev2 fine-tuning task. The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - - Returns: - UUID: The fine tuning job id, this id will used to retrieve the fine - tuned model. - - Example - ------- - >>> fine_tuning_job_id = florencev2_fine_tuning( - [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, - {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], - "OBJECT_DETECTION" - ) - """ - bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] - task_input = PromptTask[task] - fine_tuning_request = [ - BboxInputBase64( - image=convert_to_b64(bbox_input.image_path), - filename=bbox_input.image_path.split("/")[-1], - labels=bbox_input.labels, - bboxes=bbox_input.bboxes, - ) - for bbox_input in bboxes_input - ] - landing_api = LandingPublicAPI() - return landing_api.launch_fine_tuning_job( - "florencev2", task_input, fine_tuning_request - ) - - -def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool: - """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether - is possible to use a certain florencev2 model. It checks if the status - is SUCCEEDED. - - Parameters: - model_id (UUID): The fine-tuned model id. - - Returns: - bool: The indication if the model is ready to be used or not. If this - is False, it's recommended to wait 5 seconds before checking again. - - Example - ------- - >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")) - True - """ - # check if job succeeded first - landing_api = LandingPublicAPI() - status = landing_api.check_fine_tuning_job(model_id) - return status is JobStatus.SUCCEEDED - - -def florencev2_fine_tuned_object_detection( - image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool -) -> List[Dict[str, Any]]: - """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model - to detect objects given a text prompt such as a phrase or class names separated by - commas. It returns a list of detected objects as labels and their location as - bounding boxes with score of 1.0. - - Parameters: - image (np.ndarray): The image to used to detect objects. - prompt (str): The prompt to help find objects in the image. - model_id (UUID): The fine-tuned model id. - task (PromptTask): The florencev2 fine-tuning task. The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - model_is_ready (bool): If the model is ready to be used. It's recommended - to get this value from the function check_if_fine_tuned_florencev2_is_ready. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing the score, label, and - bounding box of the detected objects with normalized coordinates between 0 - and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the - top-left and xmax and ymax are the coordinates of the bottom-right of the - bounding box. The scores are always 1.0 and cannot be thresholded - - Example - ------- - >>> florencev2_fine_tuned_object_detection( - image, - 'person looking at a coyote', - UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"), - model_is_ready=check_if_fine_tuned_florencev2_is_ready( - UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") - ) - ) - [ - {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, - {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, - ] - """ - if not model_is_ready: - return [] - - task = PromptTask[task] - if task is PromptTask.OBJECT_DETECTION: - prompt = "" - - data_obj = Florencev2FtRequest( - image=convert_to_b64(image), - task=task, - tool="florencev2_fine_tuning", - prompt=prompt, - fine_tuning=FineTuning(job_id=model_id), - ) - data = data_obj.model_dump(by_alias=True) - metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} - detections = send_inference_request( - data, "tools", v2=False, metadata_payload=metadata_payload - ) - - detections = detections[task.value] - return_data = [] - image_size = image.shape[:2] - for i in range(len(detections["bboxes"])): - return_data.append( - { - "score": 1.0, - "label": detections["labels"][i], - "bbox": normalize_bbox(detections["bboxes"][i], image_size), - } - ) - return return_data - - META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -547,8 +399,5 @@ def florencev2_fine_tuned_object_detection( search_dir, search_file, find_file, - florencev2_fine_tuning, - florencev2_fine_tuned_object_detection, - check_if_fine_tuned_florencev2_is_ready, ] ) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 0254a455..52f3c6d9 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -2,6 +2,7 @@ import json import logging import tempfile +from uuid import UUID from pathlib import Path from importlib import resources from typing import Any, Dict, List, Optional, Tuple, Union, cast @@ -31,6 +32,15 @@ convert_quad_box_to_bbox, rle_decode, ) +from vision_agent.tools.tools_types import ( + BboxInput, + BboxInputBase64, + PromptTask, + Florencev2FtRequest, + FineTuning, + JobStatus, +) +from vision_agent.clients.landing_public_api import LandingPublicAPI register_heif_opener() @@ -1285,6 +1295,143 @@ def overlay_heat_map( return np.array(combined) +# TODO: add this function to the imports so that is picked in the agent +def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: + """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able + to detect objects in an image based on a given dataset. It returns the fine + tuning job id. + + Parameters: + bboxes (List[BboxInput]): A list of BboxInput containing the + image path, labels and bounding boxes. + task (PromptTask): The florencev2 fine-tuning task. The options are + CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. + + Returns: + UUID: The fine tuning job id, this id will used to retrieve the fine + tuned model. + + Example + ------- + >>> fine_tuning_job_id = florencev2_fine_tuning( + [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, + {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], + "OBJECT_DETECTION" + ) + """ + bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] + task_input = PromptTask[task] + fine_tuning_request = [ + BboxInputBase64( + image=convert_to_b64(bbox_input.image_path), + filename=bbox_input.image_path.split("/")[-1], + labels=bbox_input.labels, + bboxes=bbox_input.bboxes, + ) + for bbox_input in bboxes_input + ] + landing_api = LandingPublicAPI() + return landing_api.launch_fine_tuning_job( + "florencev2", task_input, fine_tuning_request + ) + + +def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool: + """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether + is possible to use a certain florencev2 model. It checks if the status + is SUCCEEDED. + + Parameters: + model_id (UUID): The fine-tuned model id. + + Returns: + bool: The indication if the model is ready to be used or not. If this + is False, it's recommended to wait 5 seconds before checking again. + + Example + ------- + >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")) + True + """ + # check if job succeeded first + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(model_id) + return status is JobStatus.SUCCEEDED + + +def florencev2_fine_tuned_object_detection( + image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool +) -> List[Dict[str, Any]]: + """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model + to detect objects given a text prompt such as a phrase or class names separated by + commas. It returns a list of detected objects as labels and their location as + bounding boxes with score of 1.0. + + Parameters: + image (np.ndarray): The image to used to detect objects. + prompt (str): The prompt to help find objects in the image. + model_id (UUID): The fine-tuned model id. + task (PromptTask): The florencev2 fine-tuning task. The options are + CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. + model_is_ready (bool): If the model is ready to be used. It's recommended + to get this value from the function check_if_fine_tuned_florencev2_is_ready. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the score, label, and + bounding box of the detected objects with normalized coordinates between 0 + and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the + top-left and xmax and ymax are the coordinates of the bottom-right of the + bounding box. The scores are always 1.0 and cannot be thresholded + + Example + ------- + >>> florencev2_fine_tuned_object_detection( + image, + 'person looking at a coyote', + UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"), + model_is_ready=check_if_fine_tuned_florencev2_is_ready( + UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") + ) + ) + [ + {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, + {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, + ] + """ + if not model_is_ready: + return [] + + task = PromptTask[task] + if task is PromptTask.OBJECT_DETECTION: + prompt = "" + + data_obj = Florencev2FtRequest( + image=convert_to_b64(image), + task=task, + tool="florencev2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning(job_id=model_id), + ) + data = data_obj.model_dump(by_alias=True) + metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} + detections = send_inference_request( + data, "tools", v2=False, metadata_payload=metadata_payload + ) + + detections = detections[task.value] + return_data = [] + image_size = image.shape[:2] + for i in range(len(detections["bboxes"])): + return_data.append( + { + "score": 1.0, + "label": detections["labels"][i], + "bbox": normalize_bbox(detections["bboxes"][i], image_size), + } + ) + return return_data + + TOOLS = [ owl_v2, grounding_sam, diff --git a/vision_agent/tools/meta_tools_types.py b/vision_agent/tools/tools_types.py similarity index 100% rename from vision_agent/tools/meta_tools_types.py rename to vision_agent/tools/tools_types.py From f88cd6c236b0c957349eaa54cccaef3987ef7dd4 Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Fri, 23 Aug 2024 18:29:19 -0300 Subject: [PATCH 5/7] raise exception when model is not ready --- vision_agent/tools/tools.py | 40 +++++++------------------------- vision_agent/utils/exceptions.py | 7 ++++++ 2 files changed, 15 insertions(+), 32 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 52f3c6d9..6cb697a3 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -21,6 +21,7 @@ get_tool_documentation, get_tools_df, ) +from vision_agent.utils.exceptions import FineTuneModelIsNotReady from vision_agent.utils import extract_frames_from_video from vision_agent.utils.execute import FileSerializer, MimeType from vision_agent.utils.image_utils import ( @@ -1336,31 +1337,8 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: ) -def check_if_fine_tuned_florencev2_is_ready(model_id: UUID) -> bool: - """'check_if_fine_tuned_florencev2_is_ready' is a tool that checks whether - is possible to use a certain florencev2 model. It checks if the status - is SUCCEEDED. - - Parameters: - model_id (UUID): The fine-tuned model id. - - Returns: - bool: The indication if the model is ready to be used or not. If this - is False, it's recommended to wait 5 seconds before checking again. - - Example - ------- - >>> check_if_fine_tuned_florencev2_is_ready(UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")) - True - """ - # check if job succeeded first - landing_api = LandingPublicAPI() - status = landing_api.check_fine_tuning_job(model_id) - return status is JobStatus.SUCCEEDED - - def florencev2_fine_tuned_object_detection( - image: np.ndarray, prompt: str, model_id: UUID, task: str, model_is_ready: bool + image: np.ndarray, prompt: str, model_id: UUID, task: str ) -> List[Dict[str, Any]]: """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model to detect objects given a text prompt such as a phrase or class names separated by @@ -1373,8 +1351,6 @@ def florencev2_fine_tuned_object_detection( model_id (UUID): The fine-tuned model id. task (PromptTask): The florencev2 fine-tuning task. The options are CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - model_is_ready (bool): If the model is ready to be used. It's recommended - to get this value from the function check_if_fine_tuned_florencev2_is_ready. Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, and @@ -1388,18 +1364,18 @@ def florencev2_fine_tuned_object_detection( >>> florencev2_fine_tuned_object_detection( image, 'person looking at a coyote', - UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"), - model_is_ready=check_if_fine_tuned_florencev2_is_ready( - UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") - ) + UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") ) [ {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, ] """ - if not model_is_ready: - return [] + # check if job succeeded first + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(model_id) + if status is not JobStatus.SUCCEEDED: + raise FineTuneModelIsNotReady() task = PromptTask[task] if task is PromptTask.OBJECT_DETECTION: diff --git a/vision_agent/utils/exceptions.py b/vision_agent/utils/exceptions.py index 41f81dad..ce2066b2 100644 --- a/vision_agent/utils/exceptions.py +++ b/vision_agent/utils/exceptions.py @@ -49,3 +49,10 @@ class RemoteSandboxClosedError(RemoteSandboxError): """ is_retryable = True + + +class FineTuneModelIsNotReady(Exception): + """Exception raised when the fine-tune model is not ready. + If this is raised, it's recommended to wait 5 seconds before trying to use + the model again. + """ From 653bfd210eb634168b859e10a914b853887c8a2a Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Fri, 23 Aug 2024 19:29:58 -0300 Subject: [PATCH 6/7] comment --- vision_agent/tools/tools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 6cb697a3..6d20f4f9 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1337,6 +1337,7 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: ) +# TODO: add this function to the imports so that is picked in the agent def florencev2_fine_tuned_object_detection( image: np.ndarray, prompt: str, model_id: UUID, task: str ) -> List[Dict[str, Any]]: From 14126f7269c8635464acbc7af25181e08f5716c2 Mon Sep 17 00:00:00 2001 From: Dayanne Fernandes Date: Fri, 23 Aug 2024 21:57:43 -0300 Subject: [PATCH 7/7] handle exceptions --- vision_agent/clients/http.py | 7 ++----- vision_agent/clients/landing_public_api.py | 10 +++++++++- vision_agent/utils/exceptions.py | 6 ++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/vision_agent/clients/http.py b/vision_agent/clients/http.py index dc969595..fd6b3e32 100644 --- a/vision_agent/clients/http.py +++ b/vision_agent/clients/http.py @@ -4,7 +4,6 @@ from requests import Session from requests.adapters import HTTPAdapter -from requests.exceptions import ConnectionError, RequestException, Timeout _LOGGER = logging.getLogger(__name__) @@ -38,11 +37,10 @@ def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]: response.raise_for_status() result: Dict[str, Any] = response.json() _LOGGER.info(json.dumps(result)) - except (ConnectionError, Timeout, RequestException) as err: - _LOGGER.warning(f"Error: {err}.") except json.JSONDecodeError: resp_text = response.text _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.") + raise return result def get(self, url: str) -> Dict[str, Any]: @@ -53,9 +51,8 @@ def get(self, url: str) -> Dict[str, Any]: response.raise_for_status() result: Dict[str, Any] = response.json() _LOGGER.info(json.dumps(result)) - except (ConnectionError, Timeout, RequestException) as err: - _LOGGER.warning(f"Error: {err}.") except json.JSONDecodeError: resp_text = response.text _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.") + raise return result diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py index f9d52389..3fd1928e 100644 --- a/vision_agent/clients/landing_public_api.py +++ b/vision_agent/clients/landing_public_api.py @@ -2,8 +2,11 @@ from uuid import UUID from typing import List +from requests.exceptions import HTTPError + from vision_agent.clients.http import BaseHTTP from vision_agent.utils.type_defs import LandingaiAPIKey +from vision_agent.utils.exceptions import FineTuneModelNotFound from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus @@ -27,4 +30,9 @@ def launch_fine_tuning_job( def check_fine_tuning_job(self, job_id: UUID) -> JobStatus: url = f"v1/agent/jobs/fine-tuning/{job_id}/status" - return JobStatus(self.get(url)["status"]) + try: + get_job = self.get(url) + except HTTPError as err: + if err.response.status_code == 404: + raise FineTuneModelNotFound() + return JobStatus(get_job["status"]) diff --git a/vision_agent/utils/exceptions.py b/vision_agent/utils/exceptions.py index ce2066b2..22def208 100644 --- a/vision_agent/utils/exceptions.py +++ b/vision_agent/utils/exceptions.py @@ -56,3 +56,9 @@ class FineTuneModelIsNotReady(Exception): If this is raised, it's recommended to wait 5 seconds before trying to use the model again. """ + + +class FineTuneModelNotFound(Exception): + """Exception raised when the fine-tune model is not found. + If this is raised, it's recommended to try another model id. + """