diff --git a/pyproject.toml b/pyproject.toml
index 897791d2..58125691 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -78,6 +78,8 @@ line_length = 88
profile = "black"
[tool.mypy]
+plugins = "pydantic.mypy"
+
exclude = "tests"
show_error_context = true
pretty = true
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index a41fd09f..cfb482e1 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -28,7 +28,7 @@ class DefaultImports:
code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions, florencev2_fine_tuning",
+ "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
]
@staticmethod
diff --git a/vision_agent/clients/http.py b/vision_agent/clients/http.py
index 678148a9..fd6b3e32 100644
--- a/vision_agent/clients/http.py
+++ b/vision_agent/clients/http.py
@@ -4,7 +4,6 @@
from requests import Session
from requests.adapters import HTTPAdapter
-from requests.exceptions import ConnectionError, RequestException, Timeout
_LOGGER = logging.getLogger(__name__)
@@ -38,9 +37,36 @@ def post(self, url: str, payload: Dict[str, Any]) -> Dict[str, Any]:
             response.raise_for_status()
             result: Dict[str, Any] = response.json()
             _LOGGER.info(json.dumps(result))
-        except (ConnectionError, Timeout, RequestException) as err:
-            _LOGGER.warning(f"Error: {err}.")
         except json.JSONDecodeError:
             resp_text = response.text
             _LOGGER.warning(f"Response seems incorrect: '{resp_text}'.")
+            raise
+        return result
+
+    def get(self, url: str) -> Dict[str, Any]:
+        """Send a GET request and return the decoded JSON body.
+
+        Parameters:
+            url (str): Path appended to the client's base endpoint.
+
+        Returns:
+            Dict[str, Any]: The JSON document returned by the server.
+
+        Raises:
+            requests.exceptions.HTTPError: If the server answers with an
+                error status (raised by ``raise_for_status``).
+            json.JSONDecodeError: If the response body is not valid JSON.
+        """
+        formatted_url = f"{self._base_endpoint}/{url}"
+        _LOGGER.info(f"Sending GET request to {formatted_url}")
+        # Transport errors and HTTP error statuses are deliberately not caught
+        # here so that callers can react to them.
+        response = self._session.get(url=formatted_url, timeout=self._TIMEOUT)
+        response.raise_for_status()
+        try:
+            result: Dict[str, Any] = response.json()
+        except json.JSONDecodeError:
+            _LOGGER.warning(f"Response seems incorrect: '{response.text}'.")
+            raise
+        _LOGGER.info(json.dumps(result))
         return result
diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py
index 4c50c388..3fd1928e 100644
--- a/vision_agent/clients/landing_public_api.py
+++ b/vision_agent/clients/landing_public_api.py
@@ -2,9 +2,12 @@
from uuid import UUID
from typing import List
+from requests.exceptions import HTTPError
+
from vision_agent.clients.http import BaseHTTP
from vision_agent.utils.type_defs import LandingaiAPIKey
-from vision_agent.tools.meta_tools_types import BboxInputBase64, PromptTask
+from vision_agent.utils.exceptions import FineTuneModelNotFound
+from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
class LandingPublicAPI(BaseHTTP):
@@ -24,3 +27,27 @@ def launch_fine_tuning_job(
         }
         response = self.post(url, payload=data)
         return UUID(response["jobId"])
+
+    def check_fine_tuning_job(self, job_id: UUID) -> JobStatus:
+        """Return the current status of a fine-tuning job.
+
+        Parameters:
+            job_id (UUID): Identifier of the fine-tuning job.
+
+        Returns:
+            JobStatus: The status reported by the service.
+
+        Raises:
+            FineTuneModelNotFound: If the service answers with HTTP 404.
+            requests.exceptions.HTTPError: For any other HTTP error status.
+        """
+        url = f"v1/agent/jobs/fine-tuning/{job_id}/status"
+        try:
+            get_job = self.get(url)
+        except HTTPError as err:
+            if err.response is not None and err.response.status_code == 404:
+                raise FineTuneModelNotFound() from err
+            # Any other HTTP failure must propagate; swallowing it would leave
+            # `get_job` unbound and fail below with an UnboundLocalError.
+            raise
+        return JobStatus(get_job["status"])
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 51868dd9..2f4ab4d6 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -1,6 +1,8 @@
from typing import Callable, List, Optional
-from .meta_tools import META_TOOL_DOCSTRING, florencev2_fine_tuning
+from .meta_tools import (
+ META_TOOL_DOCSTRING,
+)
from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
from .tools import (
TOOL_DESCRIPTIONS,
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 851aab18..7c857550 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,6 +1,5 @@
import os
import subprocess
-from uuid import UUID
from pathlib import Path
from typing import Any, Dict, List, Union
@@ -8,9 +7,7 @@
from vision_agent.lmm.types import Message
from vision_agent.tools.tool_utils import get_tool_documentation
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.utils.image_utils import convert_to_b64
-from vision_agent.clients.landing_public_api import LandingPublicAPI
-from vision_agent.tools.meta_tools_types import BboxInput, BboxInputBase64, PromptTask
+
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
@@ -384,51 +381,11 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str:
def get_tool_descriptions() -> str:
"""Returns a description of all the tools that `generate_vision_code` has access to.
- Helpful for answerings questions about what types of vision tasks you can do with
+ Helpful for answering questions about what types of vision tasks you can do with
`generate_vision_code`."""
return TOOL_DESCRIPTIONS
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
-
- Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image path, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
-
- Example
- -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
- )
- """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image_path),
- filename=bbox_input.image_path.split("/")[-1],
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
-
META_TOOL_DOCSTRING = get_tool_documentation(
[
get_tool_descriptions,
@@ -442,6 +399,5 @@ def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
search_dir,
search_file,
find_file,
- florencev2_fine_tuning,
]
)
diff --git a/vision_agent/tools/meta_tools_types.py b/vision_agent/tools/meta_tools_types.py
deleted file mode 100644
index 4c60923e..00000000
--- a/vision_agent/tools/meta_tools_types.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from enum import Enum
-from typing import List, Tuple
-
-from pydantic import BaseModel
-
-
-class BboxInput(BaseModel):
- image_path: str
- labels: List[str]
- bboxes: List[Tuple[int, int, int, int]]
-
-
-class BboxInputBase64(BaseModel):
- image: str
- filename: str
- labels: List[str]
- bboxes: List[Tuple[int, int, int, int]]
-
-
-class PromptTask(str, Enum):
- """
- Valid task prompts options for the Florencev2 model.
- """
-
-    CAPTION = "<CAPTION>"
-    """"""
-    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
-    """"""
-    OBJECT_DETECTION = "<OD>"
- """"""
diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
index fda89526..8d7e3aa9 100644
--- a/vision_agent/tools/tool_utils.py
+++ b/vision_agent/tools/tool_utils.py
@@ -15,9 +15,10 @@
from vision_agent.utils.type_defs import LandingaiAPIKey
_LOGGER = logging.getLogger(__name__)
-_LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.landing.ai/v1/agent/model"
-_LND_API_URL_v2 = "https://api.landing.ai/v1/tools"
+_LND_API_KEY = os.environ.get("LANDINGAI_API_KEY", LandingaiAPIKey().api_key)
+_LND_BASE_URL = os.environ.get("LANDINGAI_URL", "https://api.landing.ai")
+_LND_API_URL = f"{_LND_BASE_URL}/v1/agent/model"
+_LND_API_URL_v2 = f"{_LND_BASE_URL}/v1/tools"
class ToolCallTrace(BaseModel):
@@ -28,8 +29,13 @@ class ToolCallTrace(BaseModel):
def send_inference_request(
- payload: Dict[str, Any], endpoint_name: str, v2: bool = False
+ payload: Dict[str, Any],
+ endpoint_name: str,
+ v2: bool = False,
+ metadata_payload: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
+    # TODO: runtime_tag and function_name should be in metadata_payload and not
+    # included in the service payload
try:
if runtime_tag := os.environ.get("RUNTIME_TAG", ""):
payload["runtime_tag"] = runtime_tag
@@ -62,9 +68,13 @@ def send_inference_request(
traceback_raw=[],
)
_LOGGER.error(f"Request failed: {res.status_code} {res.text}")
- raise RemoteToolCallFailed(
- payload["function_name"], res.status_code, res.text
- )
+ # TODO: function_name should be in metadata_payload
+ function_name = "unknown"
+ if "function_name" in payload:
+ function_name = payload["function_name"]
+ elif metadata_payload is not None and "function_name" in metadata_payload:
+ function_name = metadata_payload["function_name"]
+ raise RemoteToolCallFailed(function_name, res.status_code, res.text)
resp = res.json()
tool_call_trace.response = resp
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 3070f1f2..c05369e3 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -2,6 +2,7 @@
import json
import logging
import tempfile
+from uuid import UUID
from pathlib import Path
from importlib import resources
from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -21,6 +22,7 @@
get_tools_df,
get_tools_info,
)
+from vision_agent.utils.exceptions import FineTuneModelIsNotReady
from vision_agent.utils import extract_frames_from_video
from vision_agent.utils.execute import FileSerializer, MimeType
from vision_agent.utils.image_utils import (
@@ -32,6 +34,15 @@
convert_quad_box_to_bbox,
rle_decode,
)
+from vision_agent.tools.tools_types import (
+ BboxInput,
+ BboxInputBase64,
+ PromptTask,
+ Florencev2FtRequest,
+ FineTuning,
+ JobStatus,
+)
+from vision_agent.clients.landing_public_api import LandingPublicAPI
register_heif_opener()
@@ -1286,6 +1297,119 @@ def overlay_heat_map(
return np.array(combined)
+# TODO: add this function to the imports so that it is picked up by the agent
+def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florencev2_fine_tuning' is a tool that fine-tunes florencev2 to detect objects
+    in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): Image path, labels and bounding boxes per image.
+        task (str): The name of the florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        UUID: The fine tuning job id, used to retrieve the fine tuned model.
+
+    Raises:
+        KeyError: If `task` is not the name of a PromptTask member.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+             "OBJECT_DETECTION"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_input = PromptTask[task]
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            # Use pathlib rather than splitting on "/" to get the last path component.
+            filename=Path(bbox_input.image_path).name,
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    return LandingPublicAPI().launch_fine_tuning_job(
+        "florencev2", task_input, fine_tuning_request
+    )
+
+
+# TODO: add this function to the imports so that it is picked up by the agent
+def florencev2_fine_tuned_object_detection(
+    image: np.ndarray, prompt: str, model_id: UUID, task: str
+) -> List[Dict[str, Any]]:
+    """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
+    to detect objects given a text prompt such as a phrase or class names separated by
+    commas. It returns a list of detected objects as labels and their location as
+    bounding boxes with score of 1.0.
+
+    Parameters:
+        image (np.ndarray): The image used to detect objects.
+        prompt (str): The prompt to help find objects in the image. It is ignored
+            (replaced by an empty string) when the task is OBJECT_DETECTION.
+        model_id (UUID): The fine-tuned model id.
+        task (str): The name of the florencev2 fine-tuning task. The options are
+            CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
+            bounding box of the detected objects with normalized coordinates between 0
+            and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
+            top-left and xmax and ymax are the coordinates of the bottom-right of the
+            bounding box. The scores are always 1.0 and cannot be thresholded.
+
+    Raises:
+        FineTuneModelIsNotReady: If the fine-tuning job has not succeeded yet.
+        KeyError: If `task` is not the name of a PromptTask member.
+
+    Example
+    -------
+        >>> florencev2_fine_tuned_object_detection(
+            image,
+            'person looking at a coyote',
+            UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83"),
+            "CAPTION_TO_PHRASE_GROUNDING",
+        )
+        [
+            {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
+            {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5]},
+        ]
+    """
+    # The model can only serve requests once its fine-tuning job has succeeded.
+    landing_api = LandingPublicAPI()
+    status = landing_api.check_fine_tuning_job(model_id)
+    if status is not JobStatus.SUCCEEDED:
+        raise FineTuneModelIsNotReady()
+
+    # Keep the enum in its own name instead of rebinding the `task` string argument.
+    task_input = PromptTask[task]
+    if task_input is PromptTask.OBJECT_DETECTION:
+        prompt = ""
+
+    data_obj = Florencev2FtRequest(
+        image=convert_to_b64(image),
+        task=task_input,
+        tool="florencev2_fine_tuning",
+        prompt=prompt,
+        fine_tuning=FineTuning(job_id=model_id),
+    )
+    data = data_obj.model_dump(by_alias=True)
+    metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
+    response = send_inference_request(
+        data, "tools", v2=False, metadata_payload=metadata_payload
+    )
+    detections = response[task_input.value]
+    image_size = image.shape[:2]
+    return [
+        {"score": 1.0, "label": label, "bbox": normalize_bbox(bbox, image_size)}
+        for label, bbox in zip(detections["labels"], detections["bboxes"])
+    ]
+
+
TOOLS = [
owl_v2,
grounding_sam,
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
new file mode 100644
index 00000000..aeb45c95
--- /dev/null
+++ b/vision_agent/tools/tools_types.py
@@ -0,0 +1,84 @@
+from uuid import UUID
+from enum import Enum
+from typing import List, Tuple, Optional
+
+from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo
+
+
+class BboxInput(BaseModel):  # annotations for one image referenced by file path
+    image_path: str  # path of the image file
+    labels: List[str]  # labels for the image, e.g. ["screw"]
+    bboxes: List[Tuple[int, int, int, int]]  # 4-int boxes; order TODO confirm
+
+
+class BboxInputBase64(BaseModel):  # BboxInput with the image inlined as base64
+    image: str  # base64-encoded image content
+    filename: str  # file name of the image, without its directory
+    labels: List[str]  # same meaning as BboxInput.labels
+    bboxes: List[Tuple[int, int, int, int]]  # same meaning as BboxInput.bboxes
+
+
+class PromptTask(str, Enum):
+    """
+    Valid task prompt options for the Florencev2 model.
+
+    Every member carries a distinct Florence-2 task token; members that share a
+    value would silently collapse into aliases of the first one.
+    """
+
+    CAPTION = "<CAPTION>"
+    CAPTION_TO_PHRASE_GROUNDING = "<CAPTION_TO_PHRASE_GROUNDING>"
+    OBJECT_DETECTION = "<OD>"
+
+
+class FineTuning(BaseModel):  # reference to a fine-tuning job
+    model_config = ConfigDict(populate_by_name=True)  # field name or alias accepted
+
+    job_id: UUID = Field(alias="jobId")  # dumped as "jobId" when by_alias=True
+
+    @field_serializer("job_id")
+    def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
+        return str(job_id)  # emit the UUID as a plain string
+
+
+class Florencev2FtRequest(BaseModel):  # request body for a fine-tuned florencev2 call
+    model_config = ConfigDict(populate_by_name=True)  # field name or alias accepted
+
+    image: str  # base64-encoded image
+    task: PromptTask  # florencev2 task prompt
+    tool: str  # tool name sent to the service
+    prompt: Optional[str] = ""  # text prompt; defaults to an empty string
+    fine_tuning: Optional[FineTuning] = Field(None, alias="fineTuning")  # optional job
+
+
+class JobStatus(str, Enum):
+    """The status of a fine-tuning job.
+
+    CREATED:
+        The job has been created and is waiting to be scheduled to run.
+    STARTING:
+        The job has started running, but has not yet entered the training phase.
+    TRAINING:
+        The job is training a model.
+    EVALUATING:
+        The job is evaluating the model and computing metrics.
+    PUBLISHING:
+        The job is exporting the artifact(s) to an external directory (s3 or local).
+    SUCCEEDED:
+        The job has finished, including training, evaluation and publishing the
+        artifact(s).
+    FAILED:
+        The job has failed for some internal reason; it can be due to resource
+        issues or the code itself.
+    STOPPED:
+        The job has been stopped by the user locally or in the cloud.
+    """
+
+    CREATED = "CREATED"
+    STARTING = "STARTING"
+    TRAINING = "TRAINING"
+    EVALUATING = "EVALUATING"
+    PUBLISHING = "PUBLISHING"
+    SUCCEEDED = "SUCCEEDED"
+    FAILED = "FAILED"
+    STOPPED = "STOPPED"
diff --git a/vision_agent/utils/exceptions.py b/vision_agent/utils/exceptions.py
index 41f81dad..22def208 100644
--- a/vision_agent/utils/exceptions.py
+++ b/vision_agent/utils/exceptions.py
@@ -49,3 +49,16 @@ class RemoteSandboxClosedError(RemoteSandboxError):
"""
is_retryable = True
+
+
+class FineTuneModelIsNotReady(Exception):
+    """Raised when a fine-tuned model is used before its job has succeeded.
+    If this is raised, it's recommended to wait 5 seconds before trying to use
+    the model again.
+    """
+
+
+class FineTuneModelNotFound(Exception):
+    """Raised when the service answers 404 for the requested fine-tuned model id.
+    If this is raised, it's recommended to try another model id.
+    """