From 1b3d8a86cb4a5bd2963c5ed0341f3720224dc41e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 08:29:44 -0700 Subject: [PATCH 01/30] resize image for claude --- vision_agent/lmm/lmm.py | 4 ++-- vision_agent/utils/image_utils.py | 23 ++++++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index d075dad5..d329340b 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -402,7 +402,7 @@ def chat( ] if "media" in msg: for media_path in msg["media"]: - encoded_media = encode_media(media_path) + encoded_media = encode_media(media_path, resize=768) content.append( ImageBlockParam( type="image", @@ -449,7 +449,7 @@ def generate( ] if media: for m in media: - encoded_media = encode_media(m) + encoded_media = encode_media(m, resize=768) content.append( ImageBlockParam( type="image", diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index e480dac5..b66390bd 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -5,7 +5,7 @@ from importlib import resources from io import BytesIO from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Tuple, Union, Optional import numpy as np from PIL import Image, ImageDraw, ImageFont @@ -154,15 +154,19 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str: ) -def encode_image_bytes(image: bytes) -> str: - image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore +def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str: + if resize is not None: + image_pil = Image.open(io.BytesIO(image)).convert("RGB") + image_pil.thumbnail((resize, resize)) + else: + image_pil = Image.open(io.BytesIO(image)).convert("RGB") buffer = io.BytesIO() - image.save(buffer, format="PNG") # type: ignore + image_pil.save(buffer, format="PNG") encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") return encoded_image -def encode_media(media: Union[str, Path]) -> str: +def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str: if isinstance(media, str) and media.startswith(("http", "https")): # for mp4 video url, we assume there is a same url but ends with png # vision-agent-ui will upload this png when uploading the video @@ -192,11 +196,16 @@ def encode_media(media: Union[str, Path]) -> str: frames = extract_frames_from_video(str(media), fps=1) image = frames[len(frames) // 2] buffer = io.BytesIO() - Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG") + if resize is not None: + image_pil = Image.fromarray(image[0]).convert("RGB") + image_pil.thumbnail((resize, resize)) + else: + image_pil = Image.fromarray(image[0]).convert("RGB") + image_pil.save(buffer, format="PNG") image_bytes = buffer.getvalue() else: image_bytes = open(media, "rb").read() - return encode_image_bytes(image_bytes) + return encode_image_bytes(image_bytes, resize=resize) def denormalize_bbox( From 2b112b58dfa273f30c5e639dd0b03f031626d8b5 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 08:37:11 -0700 Subject: [PATCH 02/30] only resize if above size --- vision_agent/utils/image_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index b66390bd..45181685 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -157,7 +157,8 @@ def convert_to_b64(data: 
Union[str, Path, np.ndarray, ImageType]) -> str: def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str: if resize is not None: image_pil = Image.open(io.BytesIO(image)).convert("RGB") - image_pil.thumbnail((resize, resize)) + if image_pil.size[0] > resize or image_pil.size[1] > resize: + image_pil.thumbnail((resize, resize)) else: image_pil = Image.open(io.BytesIO(image)).convert("RGB") buffer = io.BytesIO() @@ -198,7 +199,8 @@ def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str: buffer = io.BytesIO() if resize is not None: image_pil = Image.fromarray(image[0]).convert("RGB") - image_pil.thumbnail((resize, resize)) + if image_pil.size[0] > resize or image_pil.size[1] > resize: + image_pil.thumbnail((resize, resize)) else: image_pil = Image.fromarray(image[0]).convert("RGB") image_pil.save(buffer, format="PNG") From 9e716978911d1fdeba59d5f15cf8d2689809f76d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 09:07:30 -0700 Subject: [PATCH 03/30] renamed claude to anthropic for consistency --- vision_agent/agent/__init__.py | 2 +- vision_agent/agent/vision_agent.py | 66 ++++++++++++++++++++++-- vision_agent/agent/vision_agent_coder.py | 14 +++-- vision_agent/lmm/__init__.py | 2 +- vision_agent/lmm/lmm.py | 9 ++-- 5 files changed, 74 insertions(+), 19 deletions(-) diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index e1478a38..abd3bde0 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -2,7 +2,7 @@ from .vision_agent import VisionAgent from .vision_agent_coder import ( AzureVisionAgentCoder, - ClaudeVisionAgentCoder, + AnthropicVisionAgentCoder, OllamaVisionAgentCoder, VisionAgentCoder, ) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 62682524..3b0d69ec 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -3,7 +3,7 @@ import os import tempfile from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast, Callable +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast from vision_agent.agent import Agent from vision_agent.agent.agent_utils import extract_json @@ -12,7 +12,7 @@ EXAMPLES_CODE2, VA_CODE, ) -from vision_agent.lmm import LMM, Message, OpenAILMM +from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args from vision_agent.utils import CodeInterpreterFactory @@ -139,7 +139,7 @@ def __init__( self.agent = ( OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent ) - self.max_iterations = 100 + self.max_iterations = 12 self.verbosity = verbosity self.code_sandbox_runtime = code_sandbox_runtime self.callback_message = callback_message @@ -353,3 +353,63 @@ def streaming_message(self, message: Dict[str, Any]) -> None: def log_progress(self, data: Dict[str, Any]) -> None: pass + + +class OpenAIVisionAgent(VisionAgent): + def __init__( + self, + agent: Optional[LMM] = None, + verbosity: int = 0, + local_artifacts_path: Optional[Union[str, Path]] = None, + code_sandbox_runtime: Optional[str] = None, + callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> None: + """Initialize the VisionAgent using OpenAI LMMs. + + Parameters: + agent (Optional[LMM]): The agent to use for conversation and orchestration + of other agents. 
+ verbosity (int): The verbosity level of the agent. + local_artifacts_path (Optional[Union[str, Path]]): The path to the local + artifacts file. + code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. + """ + + agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent + super().__init__( + agent, + verbosity, + local_artifacts_path, + code_sandbox_runtime, + callback_message, + ) + + +class AnthropicVisionAgent(VisionAgent): + def __init__( + self, + agent: Optional[LMM] = None, + verbosity: int = 0, + local_artifacts_path: Optional[Union[str, Path]] = None, + code_sandbox_runtime: Optional[str] = None, + callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> None: + """Initialize the VisionAgent using Anthropic LMMs. + + Parameters: + agent (Optional[LMM]): The agent to use for conversation and orchestration + of other agents. + verbosity (int): The verbosity level of the agent. + local_artifacts_path (Optional[Union[str, Path]]): The path to the local + artifacts file. + code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. + """ + + agent = AnthropicLMM(temperature=0.0) if agent is None else agent + super().__init__( + agent, + verbosity, + local_artifacts_path, + code_sandbox_runtime, + callback_message, + ) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index edf249ac..ed5566b9 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -30,7 +30,7 @@ from vision_agent.lmm import ( LMM, AzureOpenAILMM, - ClaudeSonnetLMM, + AnthropicLMM, Message, OllamaLMM, OpenAILMM, @@ -904,7 +904,7 @@ def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None: ) -class ClaudeVisionAgentCoder(VisionAgentCoder): +class AnthropicVisionAgentCoder(VisionAgentCoder): def __init__( self, planner: Optional[LMM] = None, @@ -917,12 +917,10 @@ def __init__( code_sandbox_runtime: Optional[str] = None, ) -> None: # NOTE: Claude doesn't have an official JSON mode - self.planner = ClaudeSonnetLMM(temperature=0.0) if planner is None else planner - self.coder = ClaudeSonnetLMM(temperature=0.0) if coder is None else coder - self.tester = ClaudeSonnetLMM(temperature=0.0) if tester is None else tester - self.debugger = ( - ClaudeSonnetLMM(temperature=0.0) if debugger is None else debugger - ) + self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner + self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder + self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester + self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger self.verbosity = verbosity if self.verbosity > 0: _LOGGER.setLevel(logging.INFO) diff --git a/vision_agent/lmm/__init__.py b/vision_agent/lmm/__init__.py index 61ee1794..5a93589e 100644 --- a/vision_agent/lmm/__init__.py +++ b/vision_agent/lmm/__init__.py @@ -1,2 +1,2 @@ -from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM +from .lmm import LMM, AzureOpenAILMM, AnthropicLMM, OllamaLMM, OpenAILMM from .types import Message diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index d329340b..05917b74 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -1,5 +1,4 @@ import json -import logging import os from abc import ABC, abstractmethod from pathlib import Path @@ -14,8 +13,6 @@ from .types import Message -_LOGGER = logging.getLogger(__name__) - class LMM(ABC): @abstractmethod @@ -45,7 +42,7 @@ def 
__call__( class OpenAILMM(LMM): - r"""An LMM class for the OpenAI GPT-4 Vision model.""" + r"""An LMM class for the OpenAI LMMs.""" def __init__( self, @@ -365,8 +362,8 @@ def f() -> Iterator[Optional[str]]: return resp["response"] # type: ignore -class ClaudeSonnetLMM(LMM): - r"""An LMM class for Anthropic's Claude Sonnet model.""" +class AnthropicLMM(LMM): + r"""An LMM class for Anthropic's LMMs.""" def __init__( self, From e4485fa64288eebe29fa409675c51cfa170c5025 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 09:14:04 -0700 Subject: [PATCH 04/30] added openai classes and made anthropic default --- vision_agent/agent/__init__.py | 2 +- vision_agent/agent/vision_agent.py | 4 +- vision_agent/agent/vision_agent_coder.py | 47 ++++++++++++++++++++---- vision_agent/lmm/__init__.py | 2 +- vision_agent/utils/image_utils.py | 2 +- 5 files changed, 44 insertions(+), 13 deletions(-) diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index abd3bde0..62cb4f38 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,8 +1,8 @@ from .agent import Agent from .vision_agent import VisionAgent from .vision_agent_coder import ( - AzureVisionAgentCoder, AnthropicVisionAgentCoder, + AzureVisionAgentCoder, OllamaVisionAgentCoder, VisionAgentCoder, ) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 3b0d69ec..3024a844 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -136,9 +136,7 @@ def __init__( code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. """ - self.agent = ( - OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent - ) + self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent self.max_iterations = 12 self.verbosity = verbosity self.code_sandbox_runtime = code_sandbox_runtime diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index ed5566b9..beb629ee 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -29,8 +29,8 @@ ) from vision_agent.lmm import ( LMM, - AzureOpenAILMM, AnthropicLMM, + AzureOpenAILMM, Message, OllamaLMM, OpenAILMM, @@ -653,12 +653,10 @@ def __init__( also None, the local python runtime environment will be used. 
""" - self.planner = ( - OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner - ) - self.coder = OpenAILMM(temperature=0.0) if coder is None else coder - self.tester = OpenAILMM(temperature=0.0) if tester is None else tester - self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger + self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner + self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder + self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester + self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger self.verbosity = verbosity if self.verbosity > 0: _LOGGER.setLevel(logging.INFO) @@ -904,7 +902,42 @@ def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None: ) +class OpenAIVisionAgentCoder(VisionAgentCoder): + """Initializes Vision Agent Coder using OpenAI models for planning, coding, testing.""" + + def __init__( + self, + planner: Optional[LMM] = None, + coder: Optional[LMM] = None, + tester: Optional[LMM] = None, + debugger: Optional[LMM] = None, + tool_recommender: Optional[Sim] = None, + verbosity: int = 0, + report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, + code_sandbox_runtime: Optional[str] = None, + ) -> None: + self.planner = ( + OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner + ) + self.coder = OpenAILMM(temperature=0.0) if coder is None else coder + self.tester = OpenAILMM(temperature=0.0) if tester is None else tester + self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger + self.verbosity = verbosity + if self.verbosity > 0: + _LOGGER.setLevel(logging.INFO) + + self.tool_recommender = ( + Sim(T.TOOLS_DF, sim_key="desc") + if tool_recommender is None + else tool_recommender + ) + self.report_progress_callback = report_progress_callback + self.code_sandbox_runtime = code_sandbox_runtime + + class AnthropicVisionAgentCoder(VisionAgentCoder): + """Initializes Vision Agent Coder using Anthropic models for planning, coding, testing.""" + def __init__( self, planner: Optional[LMM] = None, diff --git a/vision_agent/lmm/__init__.py b/vision_agent/lmm/__init__.py index 5a93589e..c75881c1 100644 --- a/vision_agent/lmm/__init__.py +++ b/vision_agent/lmm/__init__.py @@ -1,2 +1,2 @@ -from .lmm import LMM, AzureOpenAILMM, AnthropicLMM, OllamaLMM, OpenAILMM +from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM from .types import Message diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index 45181685..0536fefd 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -5,7 +5,7 @@ from importlib import resources from io import BytesIO from pathlib import Path -from typing import Dict, List, Tuple, Union, Optional +from typing import Dict, List, Optional, Tuple, Union import numpy as np from PIL import Image, ImageDraw, ImageFont From bd8d245484b96e179702f75e462339919e53b3ef Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 13:56:30 -0700 Subject: [PATCH 05/30] add ability to view images --- vision_agent/agent/vision_agent.py | 37 +++++++++++++++++----- vision_agent/agent/vision_agent_prompts.py | 34 ++++++++++++++++---- vision_agent/tools/meta_tools.py | 32 +++++++++++++++++-- 3 files changed, 87 insertions(+), 16 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 3024a844..c0737b9e 100644 --- 
a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -10,11 +10,16 @@ from vision_agent.agent.vision_agent_prompts import ( EXAMPLES_CODE1, EXAMPLES_CODE2, + EXAMPLES_CODE3, VA_CODE, ) from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING -from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args +from vision_agent.tools.meta_tools import ( + Artifacts, + check_and_load_image, + use_extra_vision_agent_args, +) from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter, Execution @@ -30,7 +35,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -68,10 +73,17 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: prompt = VA_CODE.format( documentation=META_TOOL_DOCSTRING, - examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}", + examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}", conversation=conversation, ) - return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore + message: Message = {"role": "user", "content": prompt} + if ( + chat[-1]["role"] == "observation" + and "media" in chat[-1] + and len(chat[-1]["media"]) > 0 # type: ignore + ): + message["media"] = chat[-1]["media"] + return extract_json(orch([message], stream=False)) # type: ignore def run_code_action( @@ -319,13 +331,22 @@ def chat_with_code( code_action, code_interpreter, str(remote_artifacts_path) ) + media_obs = check_and_load_image(code_action) + if self.verbosity >= 1: _LOGGER.info(obs) + + chat_elt: Message = {"role": "observation", "content": obs} + if media_obs and result.success: + chat_elt["media"] = [ + Path(code_interpreter.remote_path) / media_ob + for media_ob in media_obs + ] + # don't add execution results to internal chat - int_chat.append({"role": "observation", "content": obs}) - orig_chat.append( - {"role": "observation", "content": obs, "execution": result} - ) + int_chat.append(chat_elt) + chat_elt["execution"] = result + orig_chat.append(chat_elt) self.streaming_message( { "role": "observation", diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 7b0dd600..557283a7 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -18,19 +18,24 @@ {examples} --- END EXAMPLES --- -**Instructions**: -1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. -2. **Output in JSON**: Respond in JSON format, {{"thoughts": , "response": , "let_user_respond": }}. - **Conversation**: Here is the current conversation so far: --- START CONVERSATION --- {conversation} +--- END CONVERSATION --- + +**Instructions**: +1. 
**Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. +2. **Output in JSON**: Respond in the following format in JSON: + +```json +{{"thoughts": , "response": , "let_user_respond": }}. +``` """ EXAMPLES_CODE1 = """ -USER: Can you detect the dogs in this image? Media name dog.jpg +USER: Can you write code to detect the dogs in this image? Media name dog.jpg OBSERVATION: [Artifacts loaded] @@ -61,6 +66,7 @@ EXAMPLES_CODE1_EXTRA = """ USER: The the image only has one dog, can you fix this? +OBSERVATION: [Artifacts loaded] Artifact dog.jpg loaded to /path/to/images/dog.jpg Artifact dog_detector.py loaded to /path/to/code/dog_detector.py @@ -86,8 +92,24 @@ AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} """ - EXAMPLES_CODE2 = """ +USER: Can you describe this image? + +OBSERVATION: +[Artifacts loaded] +Artifact image.jpg loaded to /path/to/images/image.jpg +[End of artifacts] + +AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifacts('image.jpg')", "let_user_respond": false} + +OBSERVATION: +[Image image.jpg displayed] + +AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true} +""" + + +EXAMPLES_CODE3 = """ USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes? OBSERVATION: diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 0d20cb28..25add9f9 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -486,6 +486,33 @@ def list_artifacts(artifacts: Artifacts) -> str: return output_str +def check_and_load_image(code: str) -> List[str]: + if not code.strip(): + return [] + + pattern = r"show_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)" + match = re.search(pattern, code) + if match: + name = match.group(2) + return [name] + return [] + + +def view_media_artifact(artifacts: Artifacts, name: str) -> str: + """Views the image artifact with the given name. + + Parameters: + artifacts (Artifacts): The artifacts object to show the image from. + name (str): The name of the image artifact to show. + """ + if name not in artifacts: + output_str = f"[Artifact {name} does not exist]" + else: + output_str = f"[Image {name} displayed]" + print(output_str) + return output_str + + def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. Helpful for answering questions about what types of vision tasks you can do with @@ -564,7 +591,7 @@ def use_extra_vision_agent_args( Returns: str: The edited code. 
""" - generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)" + generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)" def generate_replacer(match: re.Match) -> str: arg = match.group(1) @@ -575,7 +602,7 @@ def generate_replacer(match: re.Match) -> str: out_str += ")" return out_str - edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)" + edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)" def edit_replacer(match: re.Match) -> str: arg = match.group(1) @@ -662,6 +689,7 @@ def replacer(match: re.Match) -> str: generate_vision_code, edit_vision_code, write_media_artifact, + view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning, list_artifacts, From d64f86dbafc561b3783bfcca0d8642a4b3795ea2 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 14:12:30 -0700 Subject: [PATCH 06/30] add florence2 fine tune to owl_v2 args --- vision_agent/tools/tools.py | 46 ++++++++++++++++++++++++++++++++----- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 63927f01..3167d3be 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -149,6 +149,7 @@ def owl_v2_image( prompt: str, image: np.ndarray, box_threshold: float = 0.10, + fine_tune_id: Optional[str] = None, ) -> List[Dict[str, Any]]: """'owl_v2_image' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in @@ -160,6 +161,8 @@ def owl_v2_image( image (np.ndarray): The image to ground the prompt to. box_threshold (float, optional): The threshold for the box detection. Defaults to 0.10. + fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the + fine-tuned model ID here to use it. 
Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, and @@ -176,7 +179,38 @@ def owl_v2_image( {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5}, ] """ + image_size = image.shape[:2] + + if fine_tune_id is not None: + image_b64 = convert_to_b64(image) + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) + if status is not JobStatus.SUCCEEDED: + raise FineTuneModelIsNotReady( + f"Fine-tuned model {fine_tune_id} is not ready yet" + ) + + data_obj = Florence2FtRequest( + image=image_b64, + task=PromptTask.PHRASE_GROUNDING, + tool="florencev2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning(job_id=UUID(fine_tune_id)), + ) + data = data_obj.model_dump(by_alias=True) + detections = send_inference_request(data, "tools", v2=False) + detections = detections[""] + bboxes_formatted = [ + ODResponseData( + label=detections["labels"][i], + bbox=normalize_bbox(detections["bboxes"][i], image_size), + score=1.0, + ) + for i in range(len(detections["bboxes"])) + ] + return [bbox.model_dump() for bbox in bboxes_formatted] + buffer_bytes = numpy_to_bytes(image) files = [("image", buffer_bytes)] payload = { @@ -1119,13 +1153,13 @@ def florence2_phrase_grounding( return_data = [] for i in range(len(detections["bboxes"])): return_data.append( - { - "score": 1.0, - "label": detections["labels"][i], - "bbox": normalize_bbox(detections["bboxes"][i], image_size), - } + ODResponseData( + label=detections["labels"][i], + bbox=normalize_bbox(detections["bboxes"][i], image_size), + score=1.0, + ) ) - return return_data + return [bbox.model_dump() for bbox in return_data] def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]: From ad54abb2405677d39af5a32b04f9b4da4f10258b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 14:34:09 -0700 Subject: [PATCH 07/30] added fine tune id for florence2sam2 --- vision_agent/tools/tools.py | 42 +++++++++++++++++++++++++++++-- vision_agent/tools/tools_types.py | 1 + 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 3167d3be..53259e27 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -369,7 +369,9 @@ def grounding_sam( return return_data -def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]: +def florence2_sam2_image( + prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None +) -> List[Dict[str, Any]]: """'florence2_sam2_image' is a tool that can segment multiple objects given a text prompt such as category names or referring expressions. The categories in the text prompt are separated by commas. It returns a list of bounding boxes, label names, @@ -378,6 +380,8 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] Parameters: prompt (str): The prompt to ground to the image. image (np.ndarray): The image to ground the prompt to. + fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the + fine-tuned model ID here to use it. 
Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, @@ -403,8 +407,42 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] }, ] """ - buffer_bytes = numpy_to_bytes(image) + if fine_tune_id is not None: + image_b64 = convert_to_b64(image) + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) + if status is not JobStatus.SUCCEEDED: + raise FineTuneModelIsNotReady( + f"Fine-tuned model {fine_tune_id} is not ready yet" + ) + data_obj = Florence2FtRequest( + image=image_b64, + task=PromptTask.PHRASE_GROUNDING, + tool="florencev2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning( + job_id=UUID(fine_tune_id), + postprocessing="sam2", + ), + ) + data = data_obj.model_dump(by_alias=True) + detections = send_inference_request(data, "tools", v2=False) + detections = detections[""] + return_data = [] + all_masks = np.array(detections["masks"]) + for i in range(len(detections["bboxes"])): + return_data.append( + { + "score": 1.0, + "label": detections["labels"][i], + "bbox": detections["bboxes"][i], + "mask": all_masks[i, :, :].astype(np.uint8), + } + ) + return return_data + + buffer_bytes = numpy_to_bytes(image) files = [("image", buffer_bytes)] payload = { "prompts": [s.strip() for s in prompt.split(",")], diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 6ebcf468..aa0e430f 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -28,6 +28,7 @@ class FineTuning(BaseModel): model_config = ConfigDict(populate_by_name=True) job_id: UUID = Field(alias="jobId") + postprocessing: Optional[str] = None @field_serializer("job_id") def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: From 7432e1044fd1440776f48c58c2c50c44fe49756f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 14:55:18 -0700 Subject: [PATCH 08/30] add generic OD fine tuning --- vision_agent/tools/meta_tools.py | 62 +++++++++++++++++--------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 25add9f9..6fb3045a 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -520,16 +520,15 @@ def get_tool_descriptions() -> str: return TOOL_DESCRIPTIONS -def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: +def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str: """DO NOT use this function unless the user has supplied you with bboxes. - 'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect - objects in an image based on a given dataset. It returns the fine tuning job id. + 'object_detection_fine_tuning' is a tool that fine-tunes object detection models to + be able to detect objects in an image based on a given dataset. It returns the fine + tuning job id. Parameters: bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels and bounding boxes. The coordinates are unnormalized. - task (str): The florencev2 fine-tuning task. The options are - 'phrase_grounding'. 
Returns: str: The fine tuning job id, this id will used to retrieve the fine tuned @@ -537,12 +536,13 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: Example ------- - >>> fine_tuning_job_id = florencev2_fine_tuning( + >>> fine_tuning_job_id = object_detection_fine_tuning( [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], "phrase_grounding" ) """ + task = "phrase_grounding" bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] task_type = PromptTask[task.upper()] fine_tuning_request = [ @@ -618,48 +618,52 @@ def edit_replacer(match: re.Match) -> str: return new_code -def use_florence2_fine_tuning( - artifacts: Artifacts, name: str, task: str, fine_tune_id: str +def use_object_detection_fine_tuning( + artifacts: Artifacts, name: str, fine_tune_id: str ) -> str: - """Replaces florence2 calls with the fine tuning id. This ensures that the code - utilizes the fined tuned florence2 model. Returns the diff between the original - code and the new code. + """Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and + 'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes + the fined tuned florence2 model. Returns the diff between the original code and the + new code. Parameters: artifacts (Artifacts): The artifacts object to edit the code from. name (str): The name of the artifact to edit. - task (str): The task to fine tune the model for. The options are - 'phrase_grounding'. fine_tune_id (str): The fine tuning job id. Examples -------- - >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") + >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf") """ - task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"} - if name not in artifacts: output_str = f"[Artifact {name} does not exist]" print(output_str) return output_str code = artifacts[name] - if task.lower() == "phrase_grounding": - pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)" - - def replacer(match: re.Match) -> str: - arg = match.group(1) # capture all initial arguments - return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")' - - else: - raise ValueError(f"Task {task} is not supported.") + patterns = [ + ( + r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)", + lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")', + ), + ( + r"owl_v2_image\(\s*([^\)]+)\s*\)", + lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")', + ), + ( + r"florence2_sam2_image\(\s*([^\)]+)\s*\)", + lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")', + ), + ] - new_code = re.sub(pattern, replacer, code) + new_code = code + for pattern, replacer in patterns: + new_code = re.sub(pattern, replacer, new_code) if new_code == code: output_str = ( - f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + f"[No function calls to replace with fine tuning id in artifact {name}]" ) print(output_str) return output_str @@ -690,8 +694,8 @@ def replacer(match: re.Match) -> str: edit_vision_code, write_media_artifact, view_media_artifact, - florence2_fine_tuning, - use_florence2_fine_tuning, + object_detection_fine_tuning, + use_object_detection_fine_tuning, list_artifacts, ] ) From 7d27d638b364bafdd743c043c72c88c975cf4a37 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: 
Wed, 11 Sep 2024 14:57:47 -0700 Subject: [PATCH 09/30] fixed type error --- vision_agent/tools/tools.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 53259e27..91e918d0 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -416,7 +416,7 @@ def florence2_sam2_image( f"Fine-tuned model {fine_tune_id} is not ready yet" ) - data_obj = Florence2FtRequest( + req_data_obj = Florence2FtRequest( image=image_b64, task=PromptTask.PHRASE_GROUNDING, tool="florencev2_fine_tuning", @@ -426,17 +426,17 @@ def florence2_sam2_image( postprocessing="sam2", ), ) - data = data_obj.model_dump(by_alias=True) - detections = send_inference_request(data, "tools", v2=False) - detections = detections[""] + req_data = req_data_obj.model_dump(by_alias=True) + detections_ft = send_inference_request(req_data, "tools", v2=False) + detections_ft = detections_ft[""] return_data = [] - all_masks = np.array(detections["masks"]) - for i in range(len(detections["bboxes"])): + all_masks = np.array(detections_ft["masks"]) + for i in range(len(detections_ft["bboxes"])): return_data.append( { "score": 1.0, - "label": detections["labels"][i], - "bbox": detections["bboxes"][i], + "label": detections_ft["labels"][i], + "bbox": detections_ft["bboxes"][i], "mask": all_masks[i, :, :].astype(np.uint8), } ) @@ -448,11 +448,11 @@ def florence2_sam2_image( "prompts": [s.strip() for s in prompt.split(",")], "function_name": "florence2_sam2_image", } - data: Dict[str, Any] = send_inference_request( + detections: Dict[str, Any] = send_inference_request( payload, "florence2-sam2", files=files, v2=True ) return_data = [] - for _, data_i in data["0"].items(): + for _, data_i in detections["0"].items(): mask = rle_decode_array(data_i["mask"]) label = data_i["label"] bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"]) From e13d0195de8d22a4e8afe92a3d4db74517aeb998 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 15:19:54 -0700 Subject: [PATCH 10/30] added comment --- vision_agent/agent/vision_agent.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index c0737b9e..5a3d9c4f 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -77,6 +77,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: conversation=conversation, ) message: Message = {"role": "user", "content": prompt} + # only add recent media so we don't overload the model with old images if ( chat[-1]["role"] == "observation" and "media" in chat[-1] From c3c210b2123566211e6a77e243f1ea581513b7cf Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 15:32:56 -0700 Subject: [PATCH 11/30] fix prompt for florence2 sam2 video tracking --- vision_agent/tools/tools.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 91e918d0..0e58049a 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -467,7 +467,8 @@ def florence2_sam2_video_tracking( entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It only tracks entities present in the first frame and only returns segmentation - masks. It is useful for tracking and counting without duplicating counts. + masks. 
It is useful for tracking and counting without duplicating counts if they + appear in the first frame, always outputs scores of 1.0. Parameters: prompt (str): The prompt to ground to the video. From 39a854863d23d21a6044a5f9890c839d3acfcff1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 17:18:50 -0700 Subject: [PATCH 12/30] fixed import bug --- vision_agent/agent/vision_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 5a3d9c4f..9142c36c 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -35,7 +35,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] From 30f00f7a2ab955543975738711d49def55006b04 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 11 Sep 2024 17:43:33 -0700 Subject: [PATCH 13/30] updated fine tuning names in prompts --- vision_agent/agent/vision_agent_prompts.py | 6 +++--- vision_agent/tools/meta_tools.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 557283a7..80623016 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -159,13 +159,13 @@ USER: The detections are slightly off. Can you fine tune florence2 using these labels? 
"[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")", "let_user_respond": false} +AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} OBSERVATION: -[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] +[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. 
use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} OBSERVATION: [Artifact code.py edits] diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 6fb3045a..52d732f7 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -558,7 +558,7 @@ def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str: fine_tune_id = str( landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request) ) - print(f"[Florence2 fine tuning id: {fine_tune_id}]") + print(f"[Fine tuning id: {fine_tune_id}]") return fine_tune_id From 51ca06b6229cd3607ee11bf84eaef5566da9041f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 17 Sep 2024 13:31:51 -0700 Subject: [PATCH 14/30] improve json parsing --- vision_agent/agent/agent_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index 2a193a4a..5fd294d8 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -41,6 +41,8 @@ def _strip_markdown_code(inp_str: str) -> str: def extract_json(json_str: str) -> Dict[str, Any]: json_str = json_str.replace("\n", " ").strip() + json_str = json_str.replace("'", '"') + json_str = json_str.replace(": True", ": true").replace(": False", ": false") try: return json.loads(json_str) # type: ignore From 891def569c5d1b812c726e3dc37b43d0ab148960 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 17 Sep 2024 16:19:49 -0700 Subject: [PATCH 15/30] update json extract, add tests --- tests/unit/test_utils.py | 45 +++++++++++++++++++++++++++++++ vision_agent/agent/agent_utils.py | 12 ++++++--- 2 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 tests/unit/test_utils.py diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 00000000..4db319f9 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,45 @@ +from vision_agent.agent.agent_utils import extract_code, extract_json + + +def test_basic_json_extract(): + a = '{"a": 1, "b": 2}' + assert extract_json(a) == {"a": 1, "b": 2} + + +def test_side_case_quotes_json_extract(): + a = "{'0': 'no', '3': 'no', '6': 'no', '9': 'yes', '12': 'no', '15': 'no'}" + a_json = extract_json(a) + assert len(a_json) == 6 + + +def test_side_case_bool_json_extract(): + a = "{'0': False, '3': False, '6': False, '9': True, '12': False, '15': False}" + a_json = extract_json(a) + assert len(a_json) == 6 + + +def test_complicated_case_json_extract_1(): + a = """```json { "plan1": { "thoughts": "This plan uses the owl_v2_video tool to detect the truck and then uses ocr to read the USDOT and trailer numbers. This approach is efficient as it can process the entire video at once for truck detection.", "instructions": [ "Use extract_frames to get frames from truck1.mp4", "Use owl_v2_video with prompt 'truck' to detect if a truck is present in the video", "If a truck is detected, use ocr on relevant frames to read the USDOT and trailer numbers", "Process the OCR results to extract the USDOT and trailer numbers", "Compile results into JSON format and save using save_json" ] }, "plan2": { "thoughts": "This plan uses florence2_sam2_video_tracking to segment and track the truck, then uses florence2_ocr for text detection. 
This approach might be more accurate for text detection as it can focus on the relevant parts of the truck.", "instructions": [ "Use extract_frames to get frames from truck1.mp4", "Use florence2_sam2_video_tracking with prompt 'truck' to segment and track the truck", "If a truck is segmented, use florence2_ocr on the segmented area to detect text", "Process the OCR results to extract the USDOT and trailer numbers", "Compile results into JSON format and save using save_json" ] }, "plan3": { "thoughts": "This plan uses ixc25_video_vqa to directly ask questions about the truck, USDOT number, and trailer number. This approach leverages the model's ability to understand and answer complex questions about video content.", "instructions": [ "Use extract_frames to get frames from truck1.mp4", "Use ixc25_video_vqa with the question 'Is there a truck in this video?' to detect the presence of a truck", "If a truck is present, use ixc25_video_vqa with the question 'What is the USDOT number on the truck?'", "Use ixc25_video_vqa with the question 'What is the trailer number on the truck?'", "Process the answers to extract the required information", "Compile results into JSON format and save using save_json" ] } } ```""" + + a_json = extract_json(a) + assert len(a_json) == 3 + assert "plan1" in a_json + + +def test_complicated_case_json_extract_2(): + a = """{\n "predicted_answer": "2",\n "thoughts": "After analyzing the image and the tool outputs, I can see that there are indeed 2 dogs in the image. One is a small grey dog on the grass, and the other is a larger white dog on the patio. All three plans correctly identified 2 dogs, but I believe plan2 using the countgd_counting tool is the best choice for this task. Here\'s why:\n\n 1. Accuracy: The countgd_counting tool provided high confidence scores (0.92 and 0.9) for both dogs, which aligns with what I can see in the image.\n \n 2. Precision: The bounding boxes from the countgd_counting tool seem to be more precise and tightly fit around the dogs compared to the other tools.\n \n 3. Simplicity: While plan3 offers a more complex approach with additional verification, it\'s not necessary in this case as the dogs are clearly visible and easily identifiable. The extra steps in plan3 would add unnecessary complexity and potential for errors.\n \n 4. 
Efficiency: Plan2 is more straightforward and efficient than plan3, while potentially offering better accuracy than plan1 (owl_v2_image tool had lower confidence scores).",\n "best_plan": "plan2"\n}""" + a_json = extract_json(a) + assert len(a_json) == 3 + assert "predicted_answer" in a_json + + +def test_basic_code_extract(): + a = """```python +def test_basic_json_extract(): + a = '{"a": 1, "b": 2}' + assert extract_json(a) == {"a": 1, "b": 2} +``` +""" + a_code = extract_code(a) + assert "def test_basic_json_extract():" in a_code + assert "assert extract_json(a) == {" in a_code diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index 5fd294d8..dc0debee 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -40,14 +40,18 @@ def _strip_markdown_code(inp_str: str) -> str: def extract_json(json_str: str) -> Dict[str, Any]: - json_str = json_str.replace("\n", " ").strip() - json_str = json_str.replace("'", '"') - json_str = json_str.replace(": True", ": true").replace(": False", ": false") + json_str_mod = json_str.replace("\n", " ").strip() + json_str_mod = json_str_mod.replace("'", '"') + json_str_mod = json_str_mod.replace(": True", ": true").replace( + ": False", ": false" + ) try: - return json.loads(json_str) # type: ignore + return json.loads(json_str_mod) # type: ignore except json.JSONDecodeError: json_orig = json_str + # don't replace quotes here or booleans since it can also introduce errors + json_str = json_str.replace("\n", " ").strip() json_str = _strip_markdown_code(json_str) json_str = _find_markdown_json(json_str) json_dict = _extract_sub_json(json_str) From 0d9c00bfb8bd6abe735abdfb85307b23939f69c8 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 18 Sep 2024 19:34:30 -0700 Subject: [PATCH 16/30] removed old code --- vision_agent/agent/vision_agent_coder.py | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index beb629ee..5dc52bb6 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -92,29 +92,6 @@ def format_plans(plans: Dict[str, Any]) -> str: return plan_str -def extract_image( - media: Optional[Sequence[Union[str, Path]]], -) -> Optional[Sequence[Union[str, Path]]]: - if media is None: - return None - - new_media = [] - for m in media: - m = Path(m) - extension = m.suffix - if extension in [".jpg", ".jpeg", ".png", ".bmp"]: - new_media.append(m) - elif extension in [".mp4", ".mov"]: - frames = T.extract_frames(m) - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - if len(frames) > 0: - Image.fromarray(frames[0][0]).save(tmp.name) - new_media.append(Path(tmp.name)) - if len(new_media) == 0: - return None - return new_media - - def write_plans( chat: List[Message], tool_desc: str, From 54785deffc370a725d19227e71c53db6c008e5ef Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 18 Sep 2024 19:34:50 -0700 Subject: [PATCH 17/30] minor improvements to prompt to improve benchmark --- vision_agent/agent/vision_agent_coder_prompts.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 6d7b18d6..040ee0dc 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -30,9 +30,10 @@ **Instructions**: 1. 
Based on the context and tools you have available, create a plan of subtasks to achieve the user request. -2. Output three different plans each utilize a different strategy or set of tools. +2. Call out specific tools for each subtask and explain why you chose them, even basic tools for loading and saving media. +3. Output three different plans each utilize a different strategy or set of tools. -Output a list of jsons in the following format +Output a list of jsons in the following format: ```json {{ @@ -209,7 +210,7 @@ def remove_arrays(o): **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task. -2. **Algorithm/Method Selection**: Decide on the most efficient way. +2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool output to guide your decision. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255). """ From fb5cfc37d586f4a794292a9674f789fffc53499f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 19 Sep 2024 09:02:30 -0700 Subject: [PATCH 18/30] pass plan thoughts to coder --- vision_agent/agent/vision_agent_coder.py | 32 ++++++++++++------- .../agent/vision_agent_coder_prompts.py | 13 +++++--- 2 files changed, 29 insertions(+), 16 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 5dc52bb6..46d9c3bf 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -123,7 +123,7 @@ def pick_plan( log_progress: Callable[[Dict[str, Any]], None], verbosity: int = 0, max_retries: int = 3, -) -> Tuple[str, str]: +) -> Tuple[Dict[str, str], str]: log_progress( { "type": "log", @@ -233,10 +233,10 @@ def pick_plan( chat[-1]["content"] = prompt count = 0 - best_plan = None - while best_plan is None and count < max_retries: + plan_thoughts = None + while plan_thoughts is None and count < max_retries: try: - best_plan = extract_json(model(chat, stream=False)) # type: ignore + plan_thoughts = extract_json(model(chat, stream=False)) # type: ignore except JSONDecodeError as e: _LOGGER.exception( f"Error while extracting JSON during picking best plan {str(e)}" @@ -245,23 +245,23 @@ def pick_plan( count += 1 if ( - best_plan is None - or "best_plan" not in best_plan - or ("best_plan" in best_plan and best_plan["best_plan"] not in plans) + plan_thoughts is None + or "best_plan" not in plan_thoughts + or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans) ): - best_plan = {"best_plan": list(plans.keys())[0]} + plan_thoughts = {"best_plan": list(plans.keys())[0]} if verbosity >= 1: - _LOGGER.info(f"Best plan:\n{best_plan}") + _LOGGER.info(f"Best plan:\n{plan_thoughts}") log_progress( { "type": "log", "log_content": "Picked best plan", "status": "completed", - "payload": plans[best_plan["best_plan"]], + "payload": plans[plan_thoughts["best_plan"]], } ) - return best_plan["best_plan"], tool_output_str + return plan_thoughts, tool_output_str def write_code( @@ -269,6 +269,7 @@ def write_code( chat: List[Message], plan: str, tool_info: str, + plan_thoughts: str, tool_output: str, feedback: str, ) -> str: @@ -281,6 +282,7 @@ def write_code( docstring=tool_info, 
question=FULL_TASK.format(user_request=user_request, subtasks=plan), tool_output=tool_output, + plan_thoughts=plan_thoughts, feedback=feedback, ) chat[-1]["content"] = prompt @@ -316,6 +318,7 @@ def write_and_test_code( plan: str, tool_info: str, tool_output: str, + plan_thoughts: str, tool_utils: str, working_memory: List[Dict[str, str]], coder: LMM, @@ -340,6 +343,7 @@ def write_and_test_code( plan, tool_info, tool_output, + plan_thoughts, format_memory(working_memory), ) test = write_test( @@ -760,7 +764,7 @@ def chat_with_workflow( ) if test_multi_plan: - best_plan, tool_output_str = pick_plan( + plan_thoughts, tool_output_str = pick_plan( int_chat, plans, tool_infos["all"], @@ -770,9 +774,12 @@ def chat_with_workflow( self.log_progress, verbosity=self.verbosity, ) + best_plan = plan_thoughts["best_plan"] + plan_thoughts = plan_thoughts["thoughts"] else: best_plan = list(plans.keys())[0] tool_output_str = "" + plan_thoughts = "" if best_plan in plans and best_plan in tool_infos: plan_i = plans[best_plan] @@ -807,6 +814,7 @@ def chat_with_workflow( + "\n-".join([e for e in plan_i["instructions"]]), tool_info=tool_info, tool_output=tool_output_str, + plan_thoughts=plan_thoughts, tool_utils=T.UTILITIES_DOCSTRING, working_memory=working_memory, coder=self.coder, diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 040ee0dc..e961b896 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -114,13 +114,14 @@ ```python -import numpy as np from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking # sample at 1 FPS and use the first 10 frames to reduce processing time frames = extract_frames("video.mp4", 1) frames = [f[0] for f in frames][:10] +# import numpy for remove_array auxiliary function +import numpy as np def remove_arrays(o): if isinstance(o, list): return [remove_arrays(e) for e in o] @@ -179,7 +180,7 @@ def remove_arrays(o): 3. Output a JSON object with the following format: {{ "predicted_answer": str # the answer you would expect from the best plan - "thoughts": str # your thought process for choosing the best plan + "thoughts": str # your thought process for choosing the best plan, any adjustments you would make to the plan "best_plan": str # the best plan you have chosen }} """ @@ -202,15 +203,19 @@ def remove_arrays(o): **User Instructions**: {question} -**Tool Output**: +**Tool Outputs**: {tool_output} + +**Tool Output Thoughts**: +{plan_thoughts} + **Previous Feedback**: {feedback} **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task. -2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool output to guide your decision. +2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs to guide your decision. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255). 
""" From 64df1e816477e48e5e1154aa6b70609c954751d0 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 19 Sep 2024 09:07:43 -0700 Subject: [PATCH 19/30] fixed comments --- vision_agent/agent/vision_agent_coder.py | 27 ++++++++++++------------ 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 46d9c3bf..2efc7a50 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -615,23 +615,24 @@ def __init__( """Initialize the Vision Agent Coder. Parameters: - planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM. - coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM. - tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM. - debugger (Optional[LMM]): The debugger model to + planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM. + coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM. + tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM. + debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM. tool_recommender (Optional[Sim]): The tool recommender model to use. verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the highest verbosity level which will output all intermediate debugging code. - report_progress_callback: a callback to report the progress of the agent. - This is useful for streaming logs in a web application where multiple - VisionAgentCoder instances are running in parallel. This callback - ensures that the progress are not mixed up. - code_sandbox_runtime: the code sandbox runtime to use. A code sandbox is - used to run the generated code. It can be one of the following - values: None, "local" or "e2b". If None, VisionAgentCoder will read - the value from the environment variable CODE_SANDBOX_RUNTIME. If it's - also None, the local python runtime environment will be used. + report_progress_callback (Optional[Callable[Dict[str, Any]]]): a callback + to report the progress of the agent. This is useful for streaming logs + in a web application where multiple VisionAgentCoder instances are + running in parallel. This callback ensures that the progress are not + mixed up. + code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A + code sandbox is used to run the generated code. It can be one of the + following values: None, "local" or "e2b". If None, VisionAgentCoder + will read the value from the environment variable CODE_SANDBOX_RUNTIME. + If it's also None, the local python runtime environment will be used. 
""" self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner From 14fc101b9c6c37ce03c3361bdd832bd3266f1cea Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 19 Sep 2024 09:13:40 -0700 Subject: [PATCH 20/30] fix type and lint errors --- vision_agent/agent/vision_agent_coder.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 2efc7a50..2d29a8bf 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -2,12 +2,10 @@ import logging import os import sys -import tempfile from json import JSONDecodeError from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast -from PIL import Image from rich.console import Console from rich.style import Style from rich.syntax import Syntax @@ -776,11 +774,11 @@ def chat_with_workflow( verbosity=self.verbosity, ) best_plan = plan_thoughts["best_plan"] - plan_thoughts = plan_thoughts["thoughts"] + plan_thoughts_str = plan_thoughts["thoughts"] else: best_plan = list(plans.keys())[0] tool_output_str = "" - plan_thoughts = "" + plan_thoughts_str = "" if best_plan in plans and best_plan in tool_infos: plan_i = plans[best_plan] @@ -815,7 +813,7 @@ def chat_with_workflow( + "\n-".join([e for e in plan_i["instructions"]]), tool_info=tool_info, tool_output=tool_output_str, - plan_thoughts=plan_thoughts, + plan_thoughts=plan_thoughts_str, tool_utils=T.UTILITIES_DOCSTRING, working_memory=working_memory, coder=self.coder, From 957ed5637ee93ebb38c768c8d952d46565c8d399 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 19 Sep 2024 10:19:49 -0700 Subject: [PATCH 21/30] update tests --- tests/integ/test_tools.py | 43 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index ba5b989e..4954738c 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -21,8 +21,8 @@ grounding_dino, grounding_sam, ixc25_image_vqa, - ixc25_video_vqa, ixc25_temporal_localization, + ixc25_video_vqa, loca_visual_prompt_counting, loca_zero_shot_counting, ocr, @@ -33,6 +33,8 @@ vit_nsfw_classification, ) +FINE_TUNE_ID = "65ebba4a-88b7-419f-9046-0750e30250da" + def test_grounding_dino(): img = ski.data.coins() @@ -65,6 +67,18 @@ def test_owl_v2_image(): assert [res["label"] for res in result] == ["coin"] * len(result) +def test_owl_v2_fine_tune_id(): + img = ski.data.coins() + result = owl_v2_image( + prompt="coin", + image=img, + fine_tune_id=FINE_TUNE_ID, + ) + # this calls a fine-tuned florence2 model which is going to be worse at this task + assert 14 <= len(result) <= 26 + assert [res["label"] for res in result] == ["coin"] * len(result) + + def test_owl_v2_video(): frames = [ np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10) @@ -78,7 +92,7 @@ def test_owl_v2_video(): assert 24 <= len([res["label"] for res in result[0]]) <= 26 -def test_object_detection(): +def test_florence2_phrase_grounding(): img = ski.data.coins() result = florence2_phrase_grounding( image=img, @@ -88,6 +102,18 @@ def test_object_detection(): assert [res["label"] for res in result] == ["coin"] * 25 +def test_florence2_phrase_grounding_fine_tune_id(): + img = ski.data.coins() + result = florence2_phrase_grounding( + prompt="coin", + image=img, + fine_tune_id=FINE_TUNE_ID, + ) + # this calls a fine-tuned florence2 model which is going to be worse 
at this task + assert 14 <= len(result) <= 26 + assert [res["label"] for res in result] == ["coin"] * len(result) + + def test_template_match(): img = ski.data.coins() result = template_match( @@ -119,6 +145,19 @@ def test_florence2_sam2_image(): assert len([res["mask"] for res in result]) == 25 +def test_florence2_sam2_image_fine_tune_id(): + img = ski.data.coins() + result = florence2_sam2_image( + prompt="coin", + image=img, + fine_tune_id=FINE_TUNE_ID, + ) + # this calls a fine-tuned florence2 model which is going to be worse at this task + assert 14 <= len(result) <= 26 + assert [res["label"] for res in result] == ["coin"] * len(result) + assert len([res["mask"] for res in result]) == len(result) + + def test_florence2_sam2_video(): frames = [ np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10) From 152ac1337b642aeb7691cebb25dec95415bdb8fc Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 22 Sep 2024 12:38:01 -0700 Subject: [PATCH 22/30] make imports easier, pass more code info --- vision_agent/agent/vision_agent_coder.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 2d29a8bf..ec5ece0b 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -51,6 +51,9 @@ class DefaultImports: """Container for default imports used in the code execution.""" common_imports = [ + "import os", + "import numpy as np", + "from vision_agent.tools import *", "from typing import *", "from pillow_heif import register_heif_opener", "register_heif_opener()", @@ -174,7 +177,10 @@ def pick_plan( # retry if the tool output is empty or code fails count = 0 - while (not tool_output.success or tool_output_str == "") and count < max_retries: + while ( + not tool_output.success + or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0) + ) and count < max_retries: prompt = TEST_PLANS.format( docstring=tool_info, plans=plan_str, @@ -213,6 +219,7 @@ def pick_plan( if verbosity == 2: _print_code("Code and test after attempted fix:", code) _LOGGER.info(f"Code execution result after attempt {count + 1}") + _LOGGER.info(f"{tool_output_str}") count += 1 @@ -247,8 +254,12 @@ def pick_plan( or "best_plan" not in plan_thoughts or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans) ): + _LOGGER.info(f"Failed to pick best plan. Using the first plan. 
{plan_thoughts}")
         plan_thoughts = {"best_plan": list(plans.keys())[0]}

+    if "thoughts" not in plan_thoughts:
+        plan_thoughts["thoughts"] = ""
+
     if verbosity >= 1:
         _LOGGER.info(f"Best plan:\n{plan_thoughts}")
     log_progress(
@@ -259,7 +270,7 @@
             "payload": plans[plan_thoughts["best_plan"]],
         }
     )
-    return plan_thoughts, tool_output_str
+    return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str


 def write_code(
@@ -844,7 +855,8 @@
             "code": DefaultImports.prepend_imports(code),
             "test": test,
             "test_result": execution_result,
-            "plan": plan,
+            "plans": plans,
+            "plan_thoughts": plan_thoughts_str,
             "working_memory": working_memory,
         }

From c4ee0896456717d1cfda98a5ad92f1c36cbe5cc4 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sun, 22 Sep 2024 12:38:52 -0700
Subject: [PATCH 23/30] update prompts

---
 .../agent/vision_agent_coder_prompts.py | 72 +++++++++++++------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py
index e961b896..e117a2e1 100644
--- a/vision_agent/agent/vision_agent_coder_prompts.py
+++ b/vision_agent/agent/vision_agent_coder_prompts.py
@@ -30,8 +30,8 @@

 **Instructions**:
 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request.
-2. Call out specific tools for each subtask and explain why you chose them, even basic tools for loading and saving media.
-3. Output three different plans each utilize a different strategy or set of tools.
+2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask.
+3. Output three different plans, each utilizing a different strategy or set of tools, ordering them from most likely to least likely to succeed.

 Output a list of jsons in the following format:

@@ -68,7 +68,7 @@
 {previous_attempts}

 **Instructions**:
-1. Write a program to load the media and call each tool and save it's output.
+1. Write a program to load the media and call each tool and print its output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
 4. Print this final dictionary.
@@ -103,25 +103,25 @@

 --- EXAMPLE2 ---
 plan1:
-- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
-- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video.
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
+- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video.
 plan2:
-- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
-- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
+- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool.
 - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
```python -from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking +import numpy as np +from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking # sample at 1 FPS and use the first 10 frames to reduce processing time -frames = extract_frames("video.mp4", 1) -frames = [f[0] for f in frames][:10] +frames = extract_frames_and_timestamps("video.mp4", 1) +frames = [f["frame"] for f in frames][:10] -# import numpy for remove_array auxiliary function -import numpy as np +# strip arrays from the output to make it easier to read def remove_arrays(o): if isinstance(o, list): return [remove_arrays(e) for e in o] @@ -132,18 +132,46 @@ def remove_arrays(o): else: return o +# return the counts of each label per frame to help determine the stability of the model results +def get_counts(preds): + counts = {{}} + for i, pred_frame in enumerate(preds): + counts_i = {{}} + for pred in pred_frame: + label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"] + counts_i[label] = counts_i.get(label, 0) + 1 + counts[f"frame_{{i}}"] = counts_i + return counts + + # plan1 -owl_v2_out = [owl_v2_image("person", f) for f in frames] +owl_v2_out = owl_v2_video("person", frames) +owl_v2_counts = get_counts(owl_v2_out) # plan2 florence2_out = [florence2_phrase_grounding("person", f) for f in frames] +florence2_counts = get_counts(florence2_out) # plan3 f2s2_tracking_out = florence2_sam2_video_tracking("person", frames) remove_arrays(f2s2_tracking_out) +f2s2_counts = get_counts(f2s2_tracking_out) + +final_out = {{ + "owl_v2_video": owl_v2_out, + "florence2_phrase_grounding": florence2_out, + "florence2_sam2_video_tracking": f2s2_out, +}} + +counts = {{ + "owl_v2_video": owl_v2_counts, + "florence2_phrase_grounding": florence2_counts, + "florence2_sam2_video_tracking": f2s2_counts, +}} -final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}} print(final_out) +print(labels_and_scores) +print(counts) ``` """ @@ -161,7 +189,7 @@ def remove_arrays(o): PICK_PLAN = """ -**Role**: You are a software programmer. +**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it. **Task**: Your responsibility is to pick the best plan from the three plans provided. @@ -175,13 +203,14 @@ def remove_arrays(o): {tool_output} **Instructions**: -1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request. -2. Solve the problem yourself given the image and pick the plan that matches your solution the best. +1. Re-read the user request, plans, tool outputs and examine the image. +2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best. +3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching. 3. 
Output a JSON object with the following format: {{ "predicted_answer": str # the answer you would expect from the best plan - "thoughts": str # your thought process for choosing the best plan, any adjustments you would make to the plan - "best_plan": str # the best plan you have chosen + "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made + "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3` }} """ @@ -203,10 +232,9 @@ def remove_arrays(o): **User Instructions**: {question} -**Tool Outputs**: +**Tool Tests and Outputs**: {tool_output} - **Tool Output Thoughts**: {plan_thoughts} @@ -215,7 +243,7 @@ def remove_arrays(o): **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task. -2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs to guide your decision. +2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255). """ From 94f950172c46e4b97c6341c0262e0db230b5cf5d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 22 Sep 2024 12:39:12 -0700 Subject: [PATCH 24/30] standardize fps to 1 --- vision_agent/utils/video.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vision_agent/utils/video.py b/vision_agent/utils/video.py index ba6b0c76..0bb6fb18 100644 --- a/vision_agent/utils/video.py +++ b/vision_agent/utils/video.py @@ -61,6 +61,7 @@ def video_writer( stream.height = height - (height % 2) stream.width = width - (width % 2) stream.pix_fmt = "yuv420p" + stream.options = {"crf": "10"} for frame in frames: # Remove the alpha channel (convert RGBA to RGB) frame_rgb = frame[:, :, :3] @@ -77,7 +78,7 @@ def video_writer( def frames_to_bytes( - frames: List[np.ndarray], fps: float = 10, file_ext: str = ".mp4" + frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4" ) -> bytes: r"""Convert a list of frames to a video file encoded into a byte string. From 85e2e8ad4828ac907f959d068cab38d667e5adf3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 22 Sep 2024 12:39:29 -0700 Subject: [PATCH 25/30] rename functions to make them easier to understand by llm --- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/tools.py | 54 +++++++++++++++++++++------------- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index a401fb46..22453224 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -21,7 +21,7 @@ depth_anything_v2, detr_segmentation, dpt_hybrid_midas, - extract_frames, + extract_frames_and_timestamps, florence2_image_caption, florence2_ocr, florence2_phrase_grounding, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 0e58049a..309e9ba2 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -240,10 +240,10 @@ def owl_v2_video( box_threshold: float = 0.10, ) -> List[List[Dict[str, Any]]]: """'owl_v2_video' will run owl_v2 on each frame of a video. 
It can detect multiple
-    objects per frame given a text prompt sucha s a category name or referring
-    expression. The categories in text prompt are separated by commas. It returns a list
-    of lists where each inner list contains the score, label, and bounding box of the
-    detections for that frame.
+    objects independently per frame given a text prompt such as a category name or
+    referring expression but does not track objects across frames. The categories in
+    text prompt are separated by commas. It returns a list of lists where each inner
+    list contains the score, label, and bounding box of the detections for that frame.

     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -461,18 +461,19 @@ def florence2_sam2_image(


 def florence2_sam2_video_tracking(
-    prompt: str, frames: List[np.ndarray]
+    prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = None
 ) -> List[List[Dict[str, Any]]]:
     """'florence2_sam2_video_tracking' is a tool that can segment and track multiple
     entities in a video given a text prompt such as category names or referring
     expressions. You can optionally separate the categories in the text with commas. It
-    only tracks entities present in the first frame and only returns segmentation
-    masks. It is useful for tracking and counting without duplicating counts if they
-    appear in the first frame, always outputs scores of 1.0.
+    can find new objects every 'chunk_length' frames and is useful for tracking and
+    counting without duplicating counts and always outputs scores of 1.0.

     Parameters:
         prompt (str): The prompt to ground to the video.
         frames (List[np.ndarray]): The list of frames to ground the prompt to.
+        chunk_length (Optional[int]): The number of frames to re-run florence2 to find
+            new objects.

     Returns:
         List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label
@@ -505,6 +506,8 @@
         "prompts": [s.strip() for s in prompt.split(",")],
         "function_name": "florence2_sam2_video_tracking",
     }
+    if chunk_length is not None:
+        payload["chunk_length"] = chunk_length
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )
@@ -1570,12 +1573,14 @@ def closest_box_distance(


 # Utility and visualization functions
-def extract_frames(
+def extract_frames_and_timestamps(
     video_uri: Union[str, Path], fps: float = 1
-) -> List[Tuple[np.ndarray, float]]:
-    """'extract_frames' extracts frames from a video which can be a file path, url or
-    youtube link, returns a list of tuples (frame, timestamp), where timestamp is the
-    relative time in seconds where the frame was captured. The frame is a numpy array.
+) -> List[Dict[str, Union[np.ndarray, float]]]:
+    """'extract_frames_and_timestamps' extracts frames and timestamps from a video
+    which can be a file path, url or youtube link, returns a list of dictionaries
+    with keys "frame" and "timestamp" where "frame" is a numpy array and "timestamp" is
+    the relative time in seconds where the frame was captured. The frame is a numpy
+    array.

     Parameters:
         video_uri (Union[str, Path]): The path to the video file, url or youtube link
@@ -1583,15 +1588,23 @@
         to 1.

     Returns:
+        List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the
+        extracted frame as a numpy array and the timestamp in seconds.
Example ------- >>> extract_frames("path/to/video.mp4") - [(frame1, 0.0), (frame2, 0.5), ...] + [{"frame": np.ndarray, "timestamp": 0.0}, ...] """ + def reformat( + frames_and_timestamps: List[Tuple[np.ndarray, float]] + ) -> List[Dict[str, Union[np.ndarray, float]]]: + return [ + {"frame": frame, "timestamp": timestamp} + for frame, timestamp in frames_and_timestamps + ] + if str(video_uri).startswith( ( "http://www.youtube.com/", @@ -1613,16 +1626,16 @@ def extract_frames( raise Exception("No suitable video stream found") video_file_path = video.download(output_path=temp_dir) - return extract_frames_from_video(video_file_path, fps) + return reformat(extract_frames_from_video(video_file_path, fps)) elif str(video_uri).startswith(("http", "https")): _, image_suffix = os.path.splitext(video_uri) with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file: # Download the video and save it to the temporary file with urllib.request.urlopen(str(video_uri)) as response: tmp_file.write(response.read()) - return extract_frames_from_video(tmp_file.name, fps) + return reformat(extract_frames_from_video(tmp_file.name, fps)) - return extract_frames_from_video(str(video_uri), fps) + return reformat(extract_frames_from_video(str(video_uri), fps)) def save_json(data: Any, file_path: str) -> None: @@ -2026,7 +2039,6 @@ def overlay_counting_results( vit_image_classification, vit_nsfw_classification, countgd_counting, - florence2_image_caption, florence2_ocr, florence2_sam2_image, florence2_sam2_video_tracking, @@ -2041,7 +2053,7 @@ def overlay_counting_results( ] UTIL_TOOLS = [ - extract_frames, + extract_frames_and_timestamps, save_json, load_image, save_image, From 9b26db338ad1aec271fd48db9dd5b3cf70263ac7 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 22 Sep 2024 12:40:43 -0700 Subject: [PATCH 26/30] add openai vision agent coder --- vision_agent/agent/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 62cb4f38..793f44cf 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -4,5 +4,6 @@ AnthropicVisionAgentCoder, AzureVisionAgentCoder, OllamaVisionAgentCoder, + OpenAIVisionAgentCoder, VisionAgentCoder, ) From 921d3b76b35168e778cb1c05f96001c255346a25 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 22 Sep 2024 13:15:51 -0700 Subject: [PATCH 27/30] fix complexity --- vision_agent/agent/vision_agent.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 9142c36c..c64390d5 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -278,7 +278,8 @@ def chat_with_code( orig_chat.append({"role": "observation", "content": artifacts_loaded}) self.streaming_message({"role": "observation", "content": artifacts_loaded}) - if isinstance(last_user_message_content, str): + if int_chat[-1]["role"] == "user": + last_user_message_content = cast(str, int_chat[-1].get("content", "")) user_code_action = parse_execution(last_user_message_content, False) if user_code_action is not None: user_result, user_obs = run_code_action( @@ -320,8 +321,7 @@ def chat_with_code( else: self.streaming_message({"role": "assistant", "content": response}) - if response["let_user_respond"]: - break + finished = response["let_user_respond"] code_action = parse_execution( response["response"], test_multi_plan, customized_tool_names From 4d37e304baa4007188af900da7f95f1bca636ff1 
Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sun, 22 Sep 2024 13:17:03 -0700
Subject: [PATCH 28/30] fix type issue

---
 vision_agent/tools/tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 309e9ba2..fca3819c 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -507,7 +507,7 @@ def florence2_sam2_video_tracking(
         "function_name": "florence2_sam2_video_tracking",
     }
     if chunk_length is not None:
-        payload["chunk_length"] = chunk_length
+        payload["chunk_length"] = chunk_length  # type: ignore
     data: Dict[str, Any] = send_inference_request(
         payload, "florence2-sam2", files=files, v2=True
     )

From b11cb88e8bfbb78bcbc7e1509e852f1e8f689ea5 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Sun, 22 Sep 2024 13:55:53 -0700
Subject: [PATCH 29/30] fix lmm version

---
 vision_agent/lmm/lmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py
index 05917b74..0362fb3e 100644
--- a/vision_agent/lmm/lmm.py
+++ b/vision_agent/lmm/lmm.py
@@ -46,7 +46,7 @@ class OpenAILMM(LMM):
     def __init__(
         self,
-        model_name: str = "gpt-4o",
+        model_name: str = "gpt-4o-2024-05-13",
         api_key: Optional[str] = None,
         max_tokens: int = 4096,
         json_mode: bool = False,

From 2c9c5c5dd8d044a033bdd51c074670300b6159f6 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 23 Sep 2024 10:49:06 -0700
Subject: [PATCH 30/30] updated readme

---
 README.md     | 70 +++++++++++++++++++++++++++++++++++++++++++--------
 docs/index.md | 70 +++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 118 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index 88c59973..1529e354 100644
--- a/README.md
+++ b/README.md
@@ -33,10 +33,11 @@ To get started, you can install the library using pip:
 ```bash
 pip install vision-agent
 ```

-Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the Azure setup section):
+Ensure you have an Anthropic key and an OpenAI API key and set them in your environment
+variables (if you are using Azure OpenAI please see the Azure setup section):

 ```bash
+export ANTHROPIC_API_KEY="your-api-key"
 export OPENAI_API_KEY="your-api-key"
 ```

@@ -71,6 +72,9 @@ You can find more details about the streamlit app [here](examples/chat/).
 >>> resp = agent(resp)
 ```

+`VisionAgent` currently utilizes Claude-3.5 as its default LMM and uses OpenAI for
+embeddings for tool searching.
+
 ### Vision Agent Coder
 #### Basic Usage
 You can interact with the agent as you would with any LLM or LMM model:
@@ -132,7 +136,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med
     "code": "from vision_agent.tools import ..."
     "test": "calculate_filled_percentage('jar.jpg')",
     "test_result": "...",
-    "plan": [{"code": "...", "test": "...", "plan": "..."}, ...],
+    "plans": {"plan1": {"thoughts": "..."}, ...},
+    "plan_thoughts": "...",
     "working_memory": ...,
 }
 ```

@@ -169,20 +174,25 @@ result = agent.chat_with_workflow(conv)

 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
 while others are hosted for you.
You can easily access them yourself, for example if -you want to run `owl_v2` and visualize the output you can run: +you want to run `owl_v2_image` and visualize the output you can run: ```python import vision_agent.tools as T import matplotlib.pyplot as plt image = T.load_image("dogs.jpg") -dets = T.owl_v2("dogs", image) +dets = T.owl_v2_image("dogs", image) viz = T.overlay_bounding_boxes(image, dets) plt.imshow(viz) plt.show() ``` -You can also add custom tools to the agent: +You can find all available tools in `vision_agent/tools/tools.py`, however, +`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide +the best performance. Those can be found in the same file under the `TOOLS` variable. + +If you can't find the tool you are looking for you can also add custom tools to the +agent: ```python import vision_agent as va @@ -217,9 +227,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our we add the source code for all the tools used in `VisionAgent`. ## Additional Backends +### Anthropic +`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an +Anthropic API key and set it in your environment variables: + +```bash +export ANTHROPIC_API_KEY="your-api-key" +``` + +Because Anthropic does not support embedding models, the default embedding model used +is the OpenAI model so you will also need to set your OpenAI API key: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.AnthropicVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + +### OpenAI +`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API +key and set it in your environment variables: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.OpenAIVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + + ### Ollama -We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download -a few models: +`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models: ```bash ollama pull llama3.1 @@ -240,9 +289,8 @@ tools. You can use it just like you would use `VisionAgentCoder`: > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B. ### Azure OpenAI -We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started -follow the Azure Setup section below. You can use it just like you would use= -`VisionAgentCoder`: +`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup +section below. 
You can use it just like you would use `VisionAgentCoder`: ```python >>> import vision_agent as va diff --git a/docs/index.md b/docs/index.md index 0f5022f9..a83e343e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -30,10 +30,11 @@ To get started, you can install the library using pip: pip install vision-agent ``` -Ensure you have an OpenAI API key and set it as an environment variable (if you are -using Azure OpenAI please see the Azure setup section): +Ensure you have an Anthropic key and an OpenAI API key and set in your environment +variables (if you are using Azure OpenAI please see the Azure setup section): ```bash +export ANTHROPIC_API_KEY="your-api-key" export OPENAI_API_KEY="your-api-key" ``` @@ -68,6 +69,9 @@ You can find more details about the streamlit app [here](examples/chat/). >>> resp = agent(resp) ``` +`VisionAgent` currently utilizes Claude-3.5 as it's default LMM and uses OpenAI for +embeddings for tool searching. + ### Vision Agent Coder #### Basic Usage You can interact with the agent as you would with any LLM or LMM model: @@ -129,7 +133,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med "code": "from vision_agent.tools import ..." "test": "calculate_filled_percentage('jar.jpg')", "test_result": "...", - "plan": [{"code": "...", "test": "...", "plan": "..."}, ...], + "plans": {"plan1": {"thoughts": "..."}, ...}, + "plan_thoughts": "...", "working_memory": ..., } ``` @@ -166,20 +171,25 @@ result = agent.chat_with_workflow(conv) ### Tools There are a variety of tools for the model or the user to use. Some are executed locally while others are hosted for you. You can easily access them yourself, for example if -you want to run `owl_v2` and visualize the output you can run: +you want to run `owl_v2_image` and visualize the output you can run: ```python import vision_agent.tools as T import matplotlib.pyplot as plt image = T.load_image("dogs.jpg") -dets = T.owl_v2("dogs", image) +dets = T.owl_v2_image("dogs", image) viz = T.overlay_bounding_boxes(image, dets) plt.imshow(viz) plt.show() ``` -You can also add custom tools to the agent: +You can find all available tools in `vision_agent/tools/tools.py`, however, +`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide +the best performance. Those can be found in the same file under the `TOOLS` variable. + +If you can't find the tool you are looking for you can also add custom tools to the +agent: ```python import vision_agent as va @@ -214,9 +224,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our we add the source code for all the tools used in `VisionAgent`. ## Additional Backends +### Anthropic +`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an +Anthropic API key and set it in your environment variables: + +```bash +export ANTHROPIC_API_KEY="your-api-key" +``` + +Because Anthropic does not support embedding models, the default embedding model used +is the OpenAI model so you will also need to set your OpenAI API key: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.AnthropicVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + +### OpenAI +`OpenAIVisionAgentCoder` uses OpenAI. 
To get started you just need to get an OpenAI API +key and set it in your environment variables: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.OpenAIVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + + ### Ollama -We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download -a few models: +`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models: ```bash ollama pull llama3.1 @@ -237,9 +286,8 @@ tools. You can use it just like you would use `VisionAgentCoder`: > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B. ### Azure OpenAI -We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started -follow the Azure Setup section below. You can use it just like you would use= -`VisionAgentCoder`: +`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup +section below. You can use it just like you would use `VisionAgentCoder`: ```python >>> import vision_agent as va