diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 2bb04343..4733bb24 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -30,7 +30,7 @@ class BoilerplateCode:
pre_code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
- "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
+ # Keep this import in sync with META_TOOL_DOCSTRING: a tool that is advertised to the agent but not imported here fails with NameError in the sandbox.
+ "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning, list_artifacts",
"artifacts = Artifacts('{remote_path}')",
"artifacts.load('{remote_path}')",
]
@@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
def run_code_action(
code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> Execution:
- return code_interpreter.exec_isolation(
+) -> Tuple[Execution, str]:
+ result = code_interpreter.exec_isolation(
BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
)
+ obs = str(result.logs)
+ if result.error:
+ obs += f"\n{result.error}"
+ return result, obs
+
def parse_execution(response: str) -> Optional[str]:
code = None
@@ -192,7 +197,7 @@ def chat_with_code(
artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
with CodeInterpreterFactory.new_instance(
- code_sandbox_runtime=self.code_sandbox_runtime
+ code_sandbox_runtime=self.code_sandbox_runtime,
) as code_interpreter:
orig_chat = copy.deepcopy(chat)
int_chat = copy.deepcopy(chat)
@@ -260,10 +265,9 @@ def chat_with_code(
code_action = parse_execution(response["response"])
if code_action is not None:
- result = run_code_action(
+ result, obs = run_code_action(
code_action, code_interpreter, str(remote_artifacts_path)
)
- obs = str(result.logs)
if self.verbosity >= 1:
_LOGGER.info(obs)
diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index c8488902..dd893d1d 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -1,5 +1,4 @@
import copy
-import difflib
import logging
import os
import sys
@@ -29,6 +28,7 @@
USER_REQ,
)
from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
+from vision_agent.tools.meta_tools import get_diff
from vision_agent.utils import CodeInterpreterFactory, Execution
from vision_agent.utils.execute import CodeInterpreter
from vision_agent.utils.image_utils import b64_to_pil
@@ -63,14 +63,6 @@ def prepend_imports(code: str) -> str:
return DefaultImports.to_code_string() + "\n\n" + code
-def get_diff(before: str, after: str) -> str:
- return "".join(
- difflib.unified_diff(
- before.splitlines(keepends=True), after.splitlines(keepends=True)
- )
- )
-
-
def format_memory(memory: List[Dict[str, str]]) -> str:
output_str = ""
for i, m in enumerate(memory):
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 85e34cd5..bf9fac80 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -48,7 +48,7 @@
4| return dogs
[End of artifact]
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false}
OBSERVATION:
----- stdout -----
@@ -75,7 +75,7 @@
4| return dogs
[End of artifact]
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false}
OBSERVATION:
----- stdout -----
@@ -126,7 +126,7 @@
15| return count
[End of artifact]
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visualization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false}
OBSERVATION:
----- stdout -----
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 833ad542..3670e600 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,5 +1,7 @@
+import difflib
import os
import pickle as pkl
+import re
import subprocess
import tempfile
from pathlib import Path
@@ -8,10 +10,13 @@
from IPython.display import display
import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
from vision_agent.lmm.types import Message
from vision_agent.tools.tool_utils import get_tool_documentation
from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
@@ -99,13 +104,14 @@ def load(self, file_path: Union[str, Path]) -> None:
def show(self) -> str:
"""Shows the artifacts that have been loaded and their remote save paths."""
- out_str = "[Artifacts loaded]\n"
+ output_str = "[Artifacts loaded]\n"
for k in self.artifacts.keys():
- out_str += (
+ output_str += (
f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
)
- out_str += "[End of artifacts]\n"
- return out_str
+ output_str += "[End of artifacts]\n"
+ print(output_str)
+ return output_str
def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
save_path = (
@@ -135,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str:
def view_lines(
- lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
+ lines: List[str],
+ line_num: int,
+ window_size: int,
+ name: str,
+ total_lines: int,
+ print_output: bool = True,
) -> str:
start = max(0, line_num - window_size)
end = min(len(lines), line_num + window_size)
@@ -148,7 +159,9 @@ def view_lines(
else f"[{len(lines) - end} more lines]"
)
)
- print(return_str)
+
+ if print_output:
+ print(return_str)
return return_str
@@ -231,7 +244,7 @@ def edit_code_artifact(
new_content_lines = [
line if line.endswith("\n") else line + "\n" for line in new_content_lines
]
- lines = artifacts[name].splitlines()
+ lines = artifacts[name].splitlines(keepends=True)
edited_lines = lines[:start] + new_content_lines + lines[end:]
cur_line = start + len(content.split("\n")) // 2
@@ -261,13 +274,20 @@ def edit_code_artifact(
DEFAULT_WINDOW_SIZE,
name,
total_lines,
+ print_output=False,
)
total_lines_edit = sum(1 for _ in edited_lines)
edited_view = view_lines(
- edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+ edited_lines,
+ cur_line,
+ DEFAULT_WINDOW_SIZE,
+ name,
+ total_lines_edit,
+ print_output=False,
)
error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+ print(error_msg)
return error_msg
artifacts[name] = "".join(edited_lines)
@@ -390,6 +410,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
return f"[Media {Path(local_path).name} saved]"
+def list_artifacts(artifacts: Artifacts) -> str:
+    """Lists all the artifacts that have been loaded into the artifacts object.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to list.
+
+    Returns:
+        str: The listing returned by `artifacts.show()`.
+    """
+    # `Artifacts.show` prints the listing itself before returning it, so printing
+    # the returned string here as well would emit every artifact twice.
+    return artifacts.show()
+
+
def get_tool_descriptions() -> str:
"""Returns a description of all the tools that `generate_vision_code` has access to.
Helpful for answering questions about what types of vision tasks you can do with
@@ -397,6 +424,108 @@ def get_tool_descriptions() -> str:
return TOOL_DESCRIPTIONS
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
+ """'florence2_fine_tuning' is a tool that fine-tunes florence2 to be able to detect
+ objects in an image based on a given dataset. It returns the fine tuning job id.
+
+ Parameters:
+ bboxes (List[BboxInput]): A list of BboxInput containing the
+ image path, labels and bounding boxes.
+ task (str): The florence2 fine-tuning task. The options are
+ 'phrase_grounding'.
+
+ Returns:
+ str: The fine tuning job id, this id will be used to retrieve the fine
+ tuned model.
+
+ Example
+ -------
+ >>> fine_tuning_job_id = florence2_fine_tuning(
+ [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+ {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+ "phrase_grounding"
+ )
+ """
+ bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+ # Enum lookup by upper-cased member name; raises KeyError when `task` does not
+ # name a PromptTask member.
+ task_type = PromptTask[task.upper()]
+ fine_tuning_request = [
+ BboxInputBase64(
+ image=convert_to_b64(bbox_input.image_path),
+ filename=Path(bbox_input.image_path).name,
+ labels=bbox_input.labels,
+ bboxes=bbox_input.bboxes,
+ )
+ for bbox_input in bboxes_input
+ ]
+ landing_api = LandingPublicAPI()
+ # The job id is converted to str before it is printed and returned.
+ fine_tune_id = str(
+ landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+ )
+ print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+ return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+ """Returns the unified diff between `before` and `after` as a single string.
+
+ Both inputs are split into lines with their line endings kept and compared with
+ `difflib.unified_diff`; the result is an empty string when they are identical.
+ """
+ return "".join(
+ difflib.unified_diff(
+ before.splitlines(keepends=True), after.splitlines(keepends=True)
+ )
+ )
+
+
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fine-tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+
+    Returns:
+        str: The unified diff of the edit, or a bracketed status message when the
+            artifact does not exist or no matching call was found in the code.
+
+    Raises:
+        ValueError: If `task` is not a supported fine tuning task.
+
+    Examples
+    --------
+        >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")
+    """
+
+    task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"}
+
+    if name not in artifacts:
+        output_str = f"[Artifact {name} does not exist]"
+        print(output_str)
+        return output_str
+
+    # Normalize the task once so the support check and the function-name lookup
+    # agree. The not-found message used to index the mapping with the raw `task`,
+    # which raised KeyError for inputs such as "PHRASE_GROUNDING".
+    task_key = task.lower()
+    if task_key not in task_to_fn:
+        raise ValueError(f"Task {task} is not supported.")
+    fn_name = task_to_fn[task_key]
+
+    # NOTE(review): the argument list is captured up to the first ")", so calls
+    # whose arguments contain parentheses themselves are not rewritten correctly.
+    pattern = re.escape(fn_name) + r"\(\s*([^\)]+)\)"
+
+    def replacer(match: re.Match) -> str:
+        # Drop trailing whitespace and a trailing comma so that a multi-line call
+        # ending in "image,\n)" does not turn into "image,\n, <id>)".
+        args = match.group(1).rstrip().rstrip(",").rstrip()
+        return f'{fn_name}({args}, "{fine_tune_id}")'
+
+    code = artifacts[name]
+    new_code = re.sub(pattern, replacer, code)
+
+    if new_code == code:
+        output_str = f"[Fine tuning task {task} function {fn_name} not found in code]"
+        print(output_str)
+        return output_str
+
+    artifacts[name] = new_code
+
+    diff = get_diff(code, new_code)
+    print(diff)
+    return diff
+
+
META_TOOL_DOCSTRING = get_tool_documentation(
[
get_tool_descriptions,
@@ -406,5 +535,8 @@ def get_tool_descriptions() -> str:
generate_vision_code,
edit_vision_code,
write_media_artifact,
+ florence2_fine_tuning,
+ use_florence2_fine_tuning,
+ list_artifacts,
]
)
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 8012e60d..958b2cf6 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -28,10 +28,8 @@
filter_bboxes_by_threshold,
)
from vision_agent.tools.tools_types import (
- BboxInput,
- BboxInputBase64,
FineTuning,
- Florencev2FtRequest,
+ Florence2FtRequest,
JobStatus,
PromptTask,
ODResponseData,
@@ -867,7 +865,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
return answer[task] # type: ignore
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(
+ prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
"""'florence2_phrase_grounding' is a tool that can detect multiple
objects given a text prompt which can be object names or caption. You
can optionally separate the object names in the text with commas. It returns a list
@@ -877,6 +877,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
Parameters:
prompt (str): The prompt to ground to the image.
image (np.ndarray): The image to used to detect objects
+ fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+ fine-tuned model ID here to use it.
Returns:
List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -895,14 +897,33 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str,
"""
image_size = image.shape[:2]
image_b64 = convert_to_b64(image)
- data = {
- "image": image_b64,
- "task": "",
- "prompt": prompt,
- "function_name": "florence2_phrase_grounding",
- }
- detections = send_inference_request(data, "florence2", v2=True)
+ if fine_tune_id is not None:
+ landing_api = LandingPublicAPI()
+ status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+ if status is not JobStatus.SUCCEEDED:
+ raise FineTuneModelIsNotReady(
+ f"Fine-tuned model {fine_tune_id} is not ready yet"
+ )
+
+ data_obj = Florence2FtRequest(
+ image=image_b64,
+ task=PromptTask.PHRASE_GROUNDING,
+ tool="florencev2_fine_tuning",
+ prompt=prompt,
+ fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+ )
+ data = data_obj.model_dump(by_alias=True)
+ detections = send_inference_request(data, "tools", v2=False)
+ else:
+ data = {
+ "image": image_b64,
+ "task": "",
+ "prompt": prompt,
+ "function_name": "florence2_phrase_grounding",
+ }
+ detections = send_inference_request(data, "florence2", v2=True)
+
detections = detections[""]
return_data = []
for i in range(len(detections["bboxes"])):
@@ -1732,119 +1753,6 @@ def overlay_counting_results(
return np.array(pil_image)
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
- """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able
- to detect objects in an image based on a given dataset. It returns the fine
- tuning job id.
-
- Parameters:
- bboxes (List[BboxInput]): A list of BboxInput containing the
- image path, labels and bounding boxes.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- UUID: The fine tuning job id, this id will used to retrieve the fine
- tuned model.
-
- Example
- -------
- >>> fine_tuning_job_id = florencev2_fine_tuning(
- [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
- {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
- "OBJECT_DETECTION"
- )
- """
- bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
- task_input = PromptTask[task]
- fine_tuning_request = [
- BboxInputBase64(
- image=convert_to_b64(bbox_input.image_path),
- filename=bbox_input.image_path.split("/")[-1],
- labels=bbox_input.labels,
- bboxes=bbox_input.bboxes,
- )
- for bbox_input in bboxes_input
- ]
- landing_api = LandingPublicAPI()
- return landing_api.launch_fine_tuning_job(
- "florencev2", task_input, fine_tuning_request
- )
-
-
-# TODO: add this function to the imports so that is picked in the agent
-def florencev2_fine_tuned_object_detection(
- image: np.ndarray, prompt: str, model_id: UUID, task: str
-) -> List[Dict[str, Any]]:
- """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model
- to detect objects given a text prompt such as a phrase or class names separated by
- commas. It returns a list of detected objects as labels and their location as
- bounding boxes with score of 1.0.
-
- Parameters:
- image (np.ndarray): The image to used to detect objects.
- prompt (str): The prompt to help find objects in the image.
- model_id (UUID): The fine-tuned model id.
- task (PromptTask): The florencev2 fine-tuning task. The options are
- CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION.
-
- Returns:
- List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
- bounding box of the detected objects with normalized coordinates between 0
- and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the
- top-left and xmax and ymax are the coordinates of the bottom-right of the
- bounding box. The scores are always 1.0 and cannot be thresholded
-
- Example
- -------
- >>> florencev2_fine_tuned_object_detection(
- image,
- 'person looking at a coyote',
- UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83")
- )
- [
- {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]},
- {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5},
- ]
- """
- # check if job succeeded first
- landing_api = LandingPublicAPI()
- status = landing_api.check_fine_tuning_job(model_id)
- if status is not JobStatus.SUCCEEDED:
- raise FineTuneModelIsNotReady()
-
- task = PromptTask[task]
- if task is PromptTask.OBJECT_DETECTION:
- prompt = ""
-
- data_obj = Florencev2FtRequest(
- image=convert_to_b64(image),
- task=task,
- tool="florencev2_fine_tuning",
- prompt=prompt,
- fine_tuning=FineTuning(job_id=model_id),
- )
- data = data_obj.model_dump(by_alias=True)
- metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"}
- detections = send_inference_request(
- data, "tools", v2=False, metadata_payload=metadata_payload
- )
-
- detections = detections[task.value]
- return_data = []
- image_size = image.shape[:2]
- for i in range(len(detections["bboxes"])):
- return_data.append(
- {
- "score": 1.0,
- "label": detections["labels"][i],
- "bbox": normalize_bbox(detections["bboxes"][i], image_size),
- }
- )
- return return_data
-
-
FUNCTION_TOOLS = [
owl_v2,
extract_frames,
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index af1e8ee9..f61c2cf1 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -19,16 +19,9 @@ class BboxInputBase64(BaseModel):
class PromptTask(str, Enum):
- """
- Valid task prompts options for the Florencev2 model.
- """
+ """Valid task prompts options for the Florence2 model."""
- CAPTION = ""
- """"""
- CAPTION_TO_PHRASE_GROUNDING = ""
- """"""
- OBJECT_DETECTION = ""
- """"""
+ PHRASE_GROUNDING = ""
class FineTuning(BaseModel):
@@ -41,7 +34,7 @@ def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str:
return str(job_id)
-class Florencev2FtRequest(BaseModel):
+class Florence2FtRequest(BaseModel):
model_config = ConfigDict(populate_by_name=True)
image: str
diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 37c8d260..0de10335 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -564,7 +564,13 @@ def __init__(
) -> None:
super().__init__(timeout=timeout)
self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ # Set the notebook execution path to the remote path
+ self.resources = {"metadata": {"path": str(self.remote_path)}}
+ self.nb_client = NotebookClient(
+ self.nb,
+ timeout=self.timeout,
+ resources=self.resources,
+ )
_LOGGER.info(
f"""Local code interpreter initialized
Python version: {sys.version}
@@ -606,7 +612,9 @@ def close(self) -> None:
def restart_kernel(self) -> None:
self.close()
self.nb = nbformat.v4.new_notebook()
- self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+ self.nb_client = NotebookClient(
+ self.nb, timeout=self.timeout, resources=self.resources
+ )
sleep(1)
self._new_kernel()
@@ -636,7 +644,7 @@ def upload_file(self, file_path: Union[str, Path]) -> Path:
f.write(contents)
_LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
- return Path(self.remote_path / file_path)
+ return Path(self.remote_path / Path(file_path).name)
def download_file(
self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
@@ -672,7 +680,8 @@ def get_default_instance() -> CodeInterpreter:
@staticmethod
def new_instance(
- code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+ code_sandbox_runtime: Optional[str] = None,
+ remote_path: Optional[Union[str, Path]] = None,
) -> CodeInterpreter:
if not code_sandbox_runtime:
code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")