From 9c4bcbe13cb8e704bc90d07ff750a23bdaf2f756 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sat, 31 Aug 2024 09:35:50 -0700
Subject: [PATCH 01/15] moved fine tuning to meta tools

---
 vision_agent/tools/meta_tools.py  |  43 ++++++
 vision_agent/tools/tools.py       | 214 +++++++++++++-----------------
 vision_agent/tools/tools_types.py |  14 +-
 3 files changed, 141 insertions(+), 130 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 833ad542..e04a055d 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -4,14 +4,18 @@
 import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
+from uuid import UUID
 
 from IPython.display import display
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -397,6 +401,45 @@ def get_tool_descriptions() -> str:
     return TOOL_DESCRIPTIONS
 
 
+def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
+    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    objects in an image based on a given dataset. It returns the fine tuning job id.
+
+    Parameters:
+        bboxes (List[BboxInput]): A list of BboxInput containing the
+            image path, labels and bounding boxes.
+        task (str): The florencev2 fine-tuning task. The options are
+            'phrase_grounding'.
+
+    Returns:
+        UUID: The fine tuning job id, this id will used to retrieve the fine
+            tuned model.
+
+    Example
+    -------
+        >>> fine_tuning_job_id = florencev2_fine_tuning(
+            [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
+             {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
+            "OBJECT_DETECTION"
+        )
+    """
+    bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
+    task_type = PromptTask(task.upper())
+    fine_tuning_request = [
+        BboxInputBase64(
+            image=convert_to_b64(bbox_input.image_path),
+            filename=bbox_input.image_path.split("/")[-1],
+            labels=bbox_input.labels,
+            bboxes=bbox_input.bboxes,
+        )
+        for bbox_input in bboxes_input
+    ]
+    landing_api = LandingPublicAPI()
+    return landing_api.launch_fine_tuning_job(
+        "florencev2", task_type, fine_tuning_request
+    )
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 0695b547..92a47a99 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -29,7 +29,7 @@
     BboxInput,
     BboxInputBase64,
     FineTuning,
-    Florencev2FtRequest,
+    Florence2FtRequest,
     JobStatus,
     PromptTask,
 )
@@ -762,7 +762,7 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s
     return answer[task]  # type: ignore
 
 
-def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]:
     """'florence2_phrase_grounding' is a tool that can detect multiple objects given a
     text prompt which can be object names or caption.
You can optionally separate the object names in the text with commas. It returns a list @@ -790,14 +790,31 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray) -> List[Dict[str, """ image_size = image.shape[:2] image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "task": "", - "prompt": prompt, - "function_name": "florence2_phrase_grounding", - } - detections = send_inference_request(data, "florence2", v2=True) + if fine_tune_id is not None: + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) + if status is not JobStatus.SUCCEEDED: + raise FineTuneModelIsNotReady(f"Fine-tuned model {fine_tune_id} is not ready yet") + + data_obj = Florence2FtRequest( + image=image_b64, + task=PromptTask.PHRASE_GROUNDING, + tool="florence2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning(job_id=UUID(fine_tune_id)) + ) + data = data_obj.model_dump(by_alias=True) + detections = send_inference_request(data, "tools", v2=False) + else: + data = { + "image": image_b64, + "task": "", + "prompt": prompt, + "function_name": "florence2_phrase_grounding", + } + detections = send_inference_request(data, "florence2", v2=True) + detections = detections[""] return_data = [] for i in range(len(detections["bboxes"])): @@ -1560,116 +1577,75 @@ def overlay_heat_map( # TODO: add this function to the imports so that is picked in the agent -def florencev2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: - """'florencev2_fine_tuning' is a tool that fine-tune florencev2 to be able - to detect objects in an image based on a given dataset. It returns the fine - tuning job id. - - Parameters: - bboxes (List[BboxInput]): A list of BboxInput containing the - image path, labels and bounding boxes. - task (PromptTask): The florencev2 fine-tuning task. The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - - Returns: - UUID: The fine tuning job id, this id will used to retrieve the fine - tuned model. - - Example - ------- - >>> fine_tuning_job_id = florencev2_fine_tuning( - [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, - {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], - "OBJECT_DETECTION" - ) - """ - bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] - task_input = PromptTask[task] - fine_tuning_request = [ - BboxInputBase64( - image=convert_to_b64(bbox_input.image_path), - filename=bbox_input.image_path.split("/")[-1], - labels=bbox_input.labels, - bboxes=bbox_input.bboxes, - ) - for bbox_input in bboxes_input - ] - landing_api = LandingPublicAPI() - return landing_api.launch_fine_tuning_job( - "florencev2", task_input, fine_tuning_request - ) - - -# TODO: add this function to the imports so that is picked in the agent -def florencev2_fine_tuned_object_detection( - image: np.ndarray, prompt: str, model_id: UUID, task: str -) -> List[Dict[str, Any]]: - """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model - to detect objects given a text prompt such as a phrase or class names separated by - commas. It returns a list of detected objects as labels and their location as - bounding boxes with score of 1.0. - - Parameters: - image (np.ndarray): The image to used to detect objects. - prompt (str): The prompt to help find objects in the image. - model_id (UUID): The fine-tuned model id. - task (PromptTask): The florencev2 fine-tuning task. 
The options are - CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing the score, label, and - bounding box of the detected objects with normalized coordinates between 0 - and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the - top-left and xmax and ymax are the coordinates of the bottom-right of the - bounding box. The scores are always 1.0 and cannot be thresholded - - Example - ------- - >>> florencev2_fine_tuned_object_detection( - image, - 'person looking at a coyote', - UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") - ) - [ - {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, - {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, - ] - """ - # check if job succeeded first - landing_api = LandingPublicAPI() - status = landing_api.check_fine_tuning_job(model_id) - if status is not JobStatus.SUCCEEDED: - raise FineTuneModelIsNotReady() - - task = PromptTask[task] - if task is PromptTask.OBJECT_DETECTION: - prompt = "" - - data_obj = Florencev2FtRequest( - image=convert_to_b64(image), - task=task, - tool="florencev2_fine_tuning", - prompt=prompt, - fine_tuning=FineTuning(job_id=model_id), - ) - data = data_obj.model_dump(by_alias=True) - metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} - detections = send_inference_request( - data, "tools", v2=False, metadata_payload=metadata_payload - ) - - detections = detections[task.value] - return_data = [] - image_size = image.shape[:2] - for i in range(len(detections["bboxes"])): - return_data.append( - { - "score": 1.0, - "label": detections["labels"][i], - "bbox": normalize_bbox(detections["bboxes"][i], image_size), - } - ) - return return_data +# def florencev2_fine_tuned_object_detection( +# image: np.ndarray, prompt: str, model_id: UUID, task: str +# ) -> List[Dict[str, Any]]: +# """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model +# to detect objects given a text prompt such as a phrase or class names separated by +# commas. It returns a list of detected objects as labels and their location as +# bounding boxes with score of 1.0. + +# Parameters: +# image (np.ndarray): The image to used to detect objects. +# prompt (str): The prompt to help find objects in the image. +# model_id (UUID): The fine-tuned model id. +# task (PromptTask): The florencev2 fine-tuning task. The options are +# CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. + +# Returns: +# List[Dict[str, Any]]: A list of dictionaries containing the score, label, and +# bounding box of the detected objects with normalized coordinates between 0 +# and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the +# top-left and xmax and ymax are the coordinates of the bottom-right of the +# bounding box. 
The scores are always 1.0 and cannot be thresholded + +# Example +# ------- +# >>> florencev2_fine_tuned_object_detection( +# image, +# 'person looking at a coyote', +# UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") +# ) +# [ +# {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, +# {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, +# ] +# """ +# # check if job succeeded first +# landing_api = LandingPublicAPI() +# status = landing_api.check_fine_tuning_job(model_id) +# if status is not JobStatus.SUCCEEDED: +# raise FineTuneModelIsNotReady() + +# task = PromptTask[task] +# if task is PromptTask.OBJECT_DETECTION: +# prompt = "" + +# data_obj = Florencev2FtRequest( +# image=convert_to_b64(image), +# task=task, +# tool="florencev2_fine_tuning", +# prompt=prompt, +# fine_tuning=FineTuning(job_id=model_id), +# ) +# data = data_obj.model_dump(by_alias=True) +# metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} +# detections = send_inference_request( +# data, "tools", v2=False, metadata_payload=metadata_payload +# ) + +# detections = detections[task.value] +# return_data = [] +# image_size = image.shape[:2] +# for i in range(len(detections["bboxes"])): +# return_data.append( +# { +# "score": 1.0, +# "label": detections["labels"][i], +# "bbox": normalize_bbox(detections["bboxes"][i], image_size), +# } +# ) +# return return_data FUNCTION_TOOLS = [ diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 7b640adb..20d178d7 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -19,16 +19,8 @@ class BboxInputBase64(BaseModel): class PromptTask(str, Enum): - """ - Valid task prompts options for the Florencev2 model. - """ - - CAPTION = "" - """""" - CAPTION_TO_PHRASE_GROUNDING = "" - """""" - OBJECT_DETECTION = "" - """""" + """Valid task prompts options for the Florence2 model.""" + PHRASE_GROUNDING = "" class FineTuning(BaseModel): @@ -41,7 +33,7 @@ def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: return str(job_id) -class Florencev2FtRequest(BaseModel): +class Florence2FtRequest(BaseModel): model_config = ConfigDict(populate_by_name=True) image: str From 4f32079ae9a47d46d226d34f5865e5ef1f5fee23 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 07:49:51 -0700 Subject: [PATCH 02/15] fix error messages --- vision_agent/agent/vision_agent.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 2bb04343..77237954 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -76,11 +76,16 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: def run_code_action( code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str -) -> Execution: - return code_interpreter.exec_isolation( +) -> Tuple[Execution, str]: + result = code_interpreter.exec_isolation( BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) ) + obs = str(result.logs) + if result.error: + obs += f"\n{result.error}" + return result, obs + def parse_execution(response: str) -> Optional[str]: code = None @@ -260,10 +265,9 @@ def chat_with_code( code_action = parse_execution(response["response"]) if code_action is not None: - result = run_code_action( + result, obs = run_code_action( code_action, code_interpreter, str(remote_artifacts_path) ) - obs = str(result.logs) if self.verbosity >= 1: 
_LOGGER.info(obs) From b9e7541f66afc961776abb5846c1035739520306 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 07:50:41 -0700 Subject: [PATCH 03/15] move get_diff and add use_florence2_fine_tuning --- vision_agent/agent/vision_agent_coder.py | 10 +--- vision_agent/tools/meta_tools.py | 71 ++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 15 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index c8488902..dd893d1d 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -1,5 +1,4 @@ import copy -import difflib import logging import os import sys @@ -29,6 +28,7 @@ USER_REQ, ) from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM +from vision_agent.tools.meta_tools import get_diff from vision_agent.utils import CodeInterpreterFactory, Execution from vision_agent.utils.execute import CodeInterpreter from vision_agent.utils.image_utils import b64_to_pil @@ -63,14 +63,6 @@ def prepend_imports(code: str) -> str: return DefaultImports.to_code_string() + "\n\n" + code -def get_diff(before: str, after: str) -> str: - return "".join( - difflib.unified_diff( - before.splitlines(keepends=True), after.splitlines(keepends=True) - ) - ) - - def format_memory(memory: List[Dict[str, str]]) -> str: output_str = "" for i, m in enumerate(memory): diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index e04a055d..ee2e7c30 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,5 +1,7 @@ +import difflib import os import pickle as pkl +import re import subprocess import tempfile from pathlib import Path @@ -394,6 +396,13 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str: return f"[Media {Path(local_path).name} saved]" +def list_artifacts(artifacts: Artifacts) -> str: + """Lists all the artifacts that have been loaded into the artifacts object.""" + output_str = artifacts.show() + print(output_str) + return output_str + + def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. Helpful for answering questions about what types of vision tasks you can do with @@ -401,7 +410,7 @@ def get_tool_descriptions() -> str: return TOOL_DESCRIPTIONS -def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: +def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect objects in an image based on a given dataset. It returns the fine tuning job id. 
@@ -420,26 +429,73 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID:
         >>> fine_tuning_job_id = florencev2_fine_tuning(
             [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]},
              {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}],
-            "OBJECT_DETECTION"
+            "phrase_grounding"
         )
     """
     bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes]
-    task_type = PromptTask(task.upper())
+    task_type = PromptTask[task.upper()]
     fine_tuning_request = [
         BboxInputBase64(
             image=convert_to_b64(bbox_input.image_path),
-            filename=bbox_input.image_path.split("/")[-1],
+            filename=Path(bbox_input.image_path).name,
             labels=bbox_input.labels,
             bboxes=bbox_input.bboxes,
         )
         for bbox_input in bboxes_input
     ]
     landing_api = LandingPublicAPI()
-    return landing_api.launch_fine_tuning_job(
-        "florencev2", task_type, fine_tuning_request
+    # fine_tune_id = str(landing_api.launch_fine_tuning_job(
+    #     "florencev2", task_type, fine_tuning_request
+    # ))
+    fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf"
+    print(f"[Florence2 fine tuning id: {fine_tune_id}]")
+    return fine_tune_id
+
+
+def get_diff(before: str, after: str) -> str:
+    return "".join(
+        difflib.unified_diff(
+            before.splitlines(keepends=True), after.splitlines(keepends=True)
+        )
     )
 
 
+def use_florence2_fine_tuning(
+    artifacts: Artifacts, name: str, task: str, fine_tune_id: str
+) -> str:
+    """Replaces florence2 calls with the fine tuning id. This ensures that the code
+    utilizes the fine-tuned florence2 model. Returns the diff between the original
+    code and the new code.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to edit the code from.
+        name (str): The name of the artifact to edit.
+        task (str): The task to fine tune the model for. The options are
+            'phrase_grounding'.
+        fine_tune_id (str): The fine tuning job id.
+ + Examples + -------- + >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") + """ + code = artifacts[name] + if task.lower() == "phrase_grounding": + pattern = r'florence2_phrase_grounding\((".*?", .*?)\)' + + def replacer(match): + return f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")' + + else: + raise ValueError(f"Task {task} is not supported.") + + new_code = re.sub(pattern, replacer, code) + artifacts[name] = new_code + + diff = get_diff(code, new_code) + print(diff) + return diff + + META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, @@ -449,5 +505,8 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> UUID: generate_vision_code, edit_vision_code, write_media_artifact, + florence2_fine_tuning, + use_florence2_fine_tuning, + list_artifacts, ] ) From d0bf79e2e4916153fba652187703fc45a8903cec Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 07:52:27 -0700 Subject: [PATCH 04/15] add fine tuning arg to florence2 --- vision_agent/tools/tools.py | 14 ++++++++++---- vision_agent/tools/tools_types.py | 1 + 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 92a47a99..1ea19f62 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -762,7 +762,9 @@ def florence2_image_caption(image: np.ndarray, detail_caption: bool = True) -> s return answer[task] # type: ignore -def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None) -> List[Dict[str, Any]]: +def florence2_phrase_grounding( + prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None +) -> List[Dict[str, Any]]: """'florence2_phrase_grounding' is a tool that can detect multiple objects given a text prompt which can be object names or caption. You can optionally separate the object names in the text with commas. It returns a list @@ -772,6 +774,8 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Opt Parameters: prompt (str): The prompt to ground to the image. image (np.ndarray): The image to used to detect objects + fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the + fine-tuned model ID here to use it. 
Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, and @@ -795,14 +799,16 @@ def florence2_phrase_grounding(prompt: str, image: np.ndarray, fine_tune_id: Opt landing_api = LandingPublicAPI() status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) if status is not JobStatus.SUCCEEDED: - raise FineTuneModelIsNotReady(f"Fine-tuned model {fine_tune_id} is not ready yet") + raise FineTuneModelIsNotReady( + f"Fine-tuned model {fine_tune_id} is not ready yet" + ) data_obj = Florence2FtRequest( image=image_b64, task=PromptTask.PHRASE_GROUNDING, - tool="florence2_fine_tuning", + tool="florencev2_fine_tuning", prompt=prompt, - fine_tuning=FineTuning(job_id=UUID(fine_tune_id)) + fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), ) data = data_obj.model_dump(by_alias=True) detections = send_inference_request(data, "tools", v2=False) diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 20d178d7..eb436d94 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -20,6 +20,7 @@ class BboxInputBase64(BaseModel): class PromptTask(str, Enum): """Valid task prompts options for the Florence2 model.""" + PHRASE_GROUNDING = "" From c56d73b8de0e2b7b9e16ac95818c5d05e7e80e8d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 08:57:42 -0700 Subject: [PATCH 05/15] set notebook execute path to remote path' --- vision_agent/utils/execute.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 37c8d260..447743d1 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -564,7 +564,12 @@ def __init__( ) -> None: super().__init__(timeout=timeout) self.nb = nbformat.v4.new_notebook() - self.nb_client = NotebookClient(self.nb, timeout=self.timeout) + # Set the notebook execution path to the remote path + self.nb_client = NotebookClient( + self.nb, + timeout=self.timeout, + resources={"metadata": {"path": str(self.remote_path)}}, + ) _LOGGER.info( f"""Local code interpreter initialized Python version: {sys.version} From 1802d698d32ab45f8f3eeb3bd32f09466e6c40f3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 08:57:55 -0700 Subject: [PATCH 06/15] remove comments --- vision_agent/tools/tools.py | 74 ------------------------------------- 1 file changed, 74 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 1ea19f62..828b1ba9 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -26,8 +26,6 @@ send_inference_request, ) from vision_agent.tools.tools_types import ( - BboxInput, - BboxInputBase64, FineTuning, Florence2FtRequest, JobStatus, @@ -1582,78 +1580,6 @@ def overlay_heat_map( return np.array(combined) -# TODO: add this function to the imports so that is picked in the agent -# def florencev2_fine_tuned_object_detection( -# image: np.ndarray, prompt: str, model_id: UUID, task: str -# ) -> List[Dict[str, Any]]: -# """'florencev2_fine_tuned_object_detection' is a tool that uses a fine tuned model -# to detect objects given a text prompt such as a phrase or class names separated by -# commas. It returns a list of detected objects as labels and their location as -# bounding boxes with score of 1.0. - -# Parameters: -# image (np.ndarray): The image to used to detect objects. -# prompt (str): The prompt to help find objects in the image. -# model_id (UUID): The fine-tuned model id. 
-# task (PromptTask): The florencev2 fine-tuning task. The options are -# CAPTION, CAPTION_TO_PHRASE_GROUNDING and OBJECT_DETECTION. - -# Returns: -# List[Dict[str, Any]]: A list of dictionaries containing the score, label, and -# bounding box of the detected objects with normalized coordinates between 0 -# and 1 (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the -# top-left and xmax and ymax are the coordinates of the bottom-right of the -# bounding box. The scores are always 1.0 and cannot be thresholded - -# Example -# ------- -# >>> florencev2_fine_tuned_object_detection( -# image, -# 'person looking at a coyote', -# UUID("381cd5f9-5dc4-472d-9260-f3bb89d31f83") -# ) -# [ -# {'score': 1.0, 'label': 'person', 'bbox': [0.1, 0.11, 0.35, 0.4]}, -# {'score': 1.0, 'label': 'coyote', 'bbox': [0.34, 0.21, 0.85, 0.5}, -# ] -# """ -# # check if job succeeded first -# landing_api = LandingPublicAPI() -# status = landing_api.check_fine_tuning_job(model_id) -# if status is not JobStatus.SUCCEEDED: -# raise FineTuneModelIsNotReady() - -# task = PromptTask[task] -# if task is PromptTask.OBJECT_DETECTION: -# prompt = "" - -# data_obj = Florencev2FtRequest( -# image=convert_to_b64(image), -# task=task, -# tool="florencev2_fine_tuning", -# prompt=prompt, -# fine_tuning=FineTuning(job_id=model_id), -# ) -# data = data_obj.model_dump(by_alias=True) -# metadata_payload = {"function_name": "florencev2_fine_tuned_object_detection"} -# detections = send_inference_request( -# data, "tools", v2=False, metadata_payload=metadata_payload -# ) - -# detections = detections[task.value] -# return_data = [] -# image_size = image.shape[:2] -# for i in range(len(detections["bboxes"])): -# return_data.append( -# { -# "score": 1.0, -# "label": detections["labels"][i], -# "bbox": normalize_bbox(detections["bboxes"][i], image_size), -# } -# ) -# return return_data - - FUNCTION_TOOLS = [ owl_v2, extract_frames, From c8453e75ba3388c8e36b0f174fabb7e7fb6377a4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 09:30:14 -0700 Subject: [PATCH 07/15] fix bug exec isolation wasn't setting resources --- vision_agent/utils/execute.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 447743d1..33667f17 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -565,10 +565,11 @@ def __init__( super().__init__(timeout=timeout) self.nb = nbformat.v4.new_notebook() # Set the notebook execution path to the remote path + self.resources = {"metadata": {"path": str(self.remote_path)}} self.nb_client = NotebookClient( self.nb, timeout=self.timeout, - resources={"metadata": {"path": str(self.remote_path)}}, + resources=self.resources, ) _LOGGER.info( f"""Local code interpreter initialized @@ -611,7 +612,9 @@ def close(self) -> None: def restart_kernel(self) -> None: self.close() self.nb = nbformat.v4.new_notebook() - self.nb_client = NotebookClient(self.nb, timeout=self.timeout) + self.nb_client = NotebookClient( + self.nb, timeout=self.timeout, resources=self.resources + ) sleep(1) self._new_kernel() @@ -677,7 +680,8 @@ def get_default_instance() -> CodeInterpreter: @staticmethod def new_instance( - code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None + code_sandbox_runtime: Optional[str] = None, + remote_path: Optional[Union[str, Path]] = None, ) -> CodeInterpreter: if not code_sandbox_runtime: code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local") From 
5b3c0f01059df0b8cbf3bdae59e2701d40207172 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 09:49:28 -0700 Subject: [PATCH 08/15] ensure agent uses print to view results --- vision_agent/agent/vision_agent_prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 85e34cd5..bf9fac80 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -48,7 +48,7 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -75,7 +75,7 @@ 4| return dogs [End of artifact] -AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -126,7 +126,7 @@ 15| return count [End of artifact] -AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} OBSERVATION: ----- stdout ----- From cc0e866d55cfed6bea6acfb36c0f699411bb6fd4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 14:09:06 -0700 Subject: [PATCH 09/15] fixed bug with edit code errors --- vision_agent/tools/meta_tools.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index ee2e7c30..50c2c873 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -6,7 +6,6 @@ import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Union -from uuid import UUID from IPython.display import display @@ -105,13 +104,14 @@ def load(self, file_path: Union[str, Path]) -> None: def show(self) -> str: """Shows the artifacts that have been loaded and their remote save 
paths.""" - out_str = "[Artifacts loaded]\n" + output_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - out_str += ( + output_str += ( f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n" ) - out_str += "[End of artifacts]\n" - return out_str + output_str += "[End of artifacts]\n" + print(output_str) + return output_str def save(self, local_path: Optional[Union[str, Path]] = None) -> None: save_path = ( @@ -237,7 +237,7 @@ def edit_code_artifact( new_content_lines = [ line if line.endswith("\n") else line + "\n" for line in new_content_lines ] - lines = artifacts[name].splitlines() + lines = artifacts[name].splitlines(keepends=True) edited_lines = lines[:start] + new_content_lines + lines[end:] cur_line = start + len(content.split("\n")) // 2 @@ -274,6 +274,7 @@ def edit_code_artifact( ) error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" + print(error_msg) return error_msg artifacts[name] = "".join(edited_lines) @@ -478,6 +479,16 @@ def use_florence2_fine_tuning( -------- >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") """ + + task_to_fn = { + "phrase_grounding": "florence2_phrase_grounding" + } + + if name not in artifacts: + output_str = f"[Artifact {name} does not exist]" + print(output_str) + return output_str + code = artifacts[name] if task.lower() == "phrase_grounding": pattern = r'florence2_phrase_grounding\((".*?", .*?)\)' @@ -489,6 +500,12 @@ def replacer(match): raise ValueError(f"Task {task} is not supported.") new_code = re.sub(pattern, replacer, code) + + if new_code == code: + output_str = f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + print(output_str) + return output_str + artifacts[name] = new_code diff = get_diff(code, new_code) From 7125e620f0971ce2163103498cca095ec0869431 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 14:28:25 -0700 Subject: [PATCH 10/15] fixed bug with edit code errors, and fixed replace code for fine tune --- vision_agent/tools/meta_tools.py | 33 +++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 50c2c873..93cdccf2 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -141,7 +141,12 @@ def format_lines(lines: List[str], start_idx: int) -> str: def view_lines( - lines: List[str], line_num: int, window_size: int, name: str, total_lines: int + lines: List[str], + line_num: int, + window_size: int, + name: str, + total_lines: int, + print_output: bool = True, ) -> str: start = max(0, line_num - window_size) end = min(len(lines), line_num + window_size) @@ -154,7 +159,9 @@ def view_lines( else f"[{len(lines) - end} more lines]" ) ) - print(return_str) + + if print_output: + print(return_str) return return_str @@ -267,10 +274,16 @@ def edit_code_artifact( DEFAULT_WINDOW_SIZE, name, total_lines, + print_output=False, ) total_lines_edit = sum(1 for _ in edited_lines) edited_view = view_lines( - edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit + edited_lines, + cur_line, + DEFAULT_WINDOW_SIZE, + name, + total_lines_edit, + print_output=False, ) error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" @@ -480,9 +493,7 @@ def 
use_florence2_fine_tuning( >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") """ - task_to_fn = { - "phrase_grounding": "florence2_phrase_grounding" - } + task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"} if name not in artifacts: output_str = f"[Artifact {name} does not exist]" @@ -491,10 +502,12 @@ def use_florence2_fine_tuning( code = artifacts[name] if task.lower() == "phrase_grounding": - pattern = r'florence2_phrase_grounding\((".*?", .*?)\)' + pattern = r"florence2_phrase_grounding\(([^,]+),\s*([^\)]+)\)" def replacer(match): - return f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")' + arg1 = match.group(1) + arg2 = match.group(2) + return f'florence2_phrase_grounding({arg1}, {arg2}, "{fine_tune_id}")' else: raise ValueError(f"Task {task} is not supported.") @@ -502,7 +515,9 @@ def replacer(match): new_code = re.sub(pattern, replacer, code) if new_code == code: - output_str = f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + output_str = ( + f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + ) print(output_str) return output_str From 42cf17237fcd15b50422735a8cea9ce79120cd8b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 15:13:12 -0700 Subject: [PATCH 11/15] add imports for new meta tools --- vision_agent/agent/vision_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 77237954..4733bb24 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -30,7 +30,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -197,7 +197,7 @@ def chat_with_code( artifacts = Artifacts(WORKSPACE / "artifacts.pkl") with CodeInterpreterFactory.new_instance( - code_sandbox_runtime=self.code_sandbox_runtime + code_sandbox_runtime=self.code_sandbox_runtime, ) as code_interpreter: orig_chat = copy.deepcopy(chat) int_chat = copy.deepcopy(chat) From 619780eb839b1c195451e420eab3094c2dc965e9 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 3 Sep 2024 15:14:38 -0700 Subject: [PATCH 12/15] fixed type errors --- vision_agent/tools/meta_tools.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 93cdccf2..aa809818 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -458,10 +458,9 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: for bbox_input in bboxes_input ] landing_api = LandingPublicAPI() - # fine_tune_id = str(landing_api.launch_fine_tuning_job( - # "florencev2", task_type, fine_tuning_request - # )) - fine_tune_id = "23b3b022-5ebf-4798-9373-20ef36429abf" + fine_tune_id = str(landing_api.launch_fine_tuning_job( + "florencev2", task_type, 
fine_tuning_request
-    ))
+    fine_tune_id = str(
+        landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request)
+    )
     print(f"[Florence2 fine tuning id: {fine_tune_id}]")
     return fine_tune_id
 
From 7dac4cc9cea6b268518affb76ce312ad4caf5d60 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Tue, 3 Sep 2024 15:40:10 -0700
Subject: [PATCH 14/15] fixed regex

---
 vision_agent/tools/meta_tools.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7129a94c..3670e600 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -501,12 +501,11 @@ def use_florence2_fine_tuning(
 
     code = artifacts[name]
     if task.lower() == "phrase_grounding":
-        pattern = r"florence2_phrase_grounding\(([^,]+),\s*([^\)]+)\)"
+        pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)"
 
         def replacer(match: re.Match) -> str:
-            arg1 = match.group(1)
-            arg2 = match.group(2)
-            return f'florence2_phrase_grounding({arg1}, {arg2}, "{fine_tune_id}")'
+            arg = match.group(1)  # capture all initial arguments
+            return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")'
 
     else:
         raise ValueError(f"Task {task} is not supported.")

From 534d674114c4429a0012919837a9613fa7fcbcb6 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Tue, 3 Sep 2024 20:32:43 -0700
Subject: [PATCH 15/15] fix bug with upload return path

---
 vision_agent/utils/execute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 33667f17..0de10335 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -644,7 +644,7 @@ def upload_file(self, file_path: Union[str, Path]) -> Path:
             f.write(contents)
 
         _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
-        return Path(self.remote_path / file_path)
+        return Path(self.remote_path / Path(file_path).name)
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]