From 696da6c3d1a461e8a9050645a30b5dd287258a2f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 23 Sep 2024 11:31:45 -0700 Subject: [PATCH] Full Claude Sonnet 3.5 Support (#234) * resize image for claude * only resize if above size * renamed claude to anthropic for consistency * added openai classes and made anthropic default * add ability to view images * add florence2 fine tune to owl_v2 args * added fine tune id for florence2sam2 * add generic OD fine tuning * fixed type error * added comment * fix prompt for florence2 sam2 video tracking * fixed import bug * updated fine tuning names in prompts * improve json parsing * update json extract, add tests * removed old code * minor improvements to prompt to improve benchmark * pass plan thoughts to coder * fixed comments * fix type and lint errors * update tests * make imports easier, pass more code info * update prompts * standardize fps to 1 * rename functions to make them easier to understand by llm * add openai vision agent coder * fix complexity * fix type issue * fix lmm version * updated readme --- README.md | 70 ++++++-- docs/index.md | 70 ++++++-- tests/integ/test_tools.py | 43 ++++- tests/unit/test_utils.py | 45 +++++ vision_agent/agent/__init__.py | 3 +- vision_agent/agent/agent_utils.py | 10 +- vision_agent/agent/vision_agent.py | 114 +++++++++++-- vision_agent/agent/vision_agent_coder.py | 159 ++++++++++-------- .../agent/vision_agent_coder_prompts.py | 72 +++++--- vision_agent/agent/vision_agent_prompts.py | 40 ++++- vision_agent/lmm/__init__.py | 2 +- vision_agent/lmm/lmm.py | 15 +- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/meta_tools.py | 96 +++++++---- vision_agent/tools/tools.py | 145 ++++++++++++---- vision_agent/tools/tools_types.py | 1 + vision_agent/utils/image_utils.py | 25 ++- vision_agent/utils/video.py | 3 +- 18 files changed, 696 insertions(+), 219 deletions(-) create mode 100644 tests/unit/test_utils.py diff --git a/README.md b/README.md index 88c59973..1529e354 100644 --- a/README.md +++ b/README.md @@ -33,10 +33,11 @@ To get started, you can install the library using pip: pip install vision-agent ``` -Ensure you have an OpenAI API key and set it as an environment variable (if you are -using Azure OpenAI please see the Azure setup section): +Ensure you have an Anthropic key and an OpenAI API key and set in your environment +variables (if you are using Azure OpenAI please see the Azure setup section): ```bash +export ANTHROPIC_API_KEY="your-api-key" export OPENAI_API_KEY="your-api-key" ``` @@ -71,6 +72,9 @@ You can find more details about the streamlit app [here](examples/chat/). >>> resp = agent(resp) ``` +`VisionAgent` currently utilizes Claude-3.5 as it's default LMM and uses OpenAI for +embeddings for tool searching. + ### Vision Agent Coder #### Basic Usage You can interact with the agent as you would with any LLM or LMM model: @@ -132,7 +136,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med "code": "from vision_agent.tools import ..." "test": "calculate_filled_percentage('jar.jpg')", "test_result": "...", - "plan": [{"code": "...", "test": "...", "plan": "..."}, ...], + "plans": {"plan1": {"thoughts": "..."}, ...}, + "plan_thoughts": "...", "working_memory": ..., } ``` @@ -169,20 +174,25 @@ result = agent.chat_with_workflow(conv) ### Tools There are a variety of tools for the model or the user to use. Some are executed locally while others are hosted for you. 
You can easily access them yourself, for example if -you want to run `owl_v2` and visualize the output you can run: +you want to run `owl_v2_image` and visualize the output you can run: ```python import vision_agent.tools as T import matplotlib.pyplot as plt image = T.load_image("dogs.jpg") -dets = T.owl_v2("dogs", image) +dets = T.owl_v2_image("dogs", image) viz = T.overlay_bounding_boxes(image, dets) plt.imshow(viz) plt.show() ``` -You can also add custom tools to the agent: +You can find all available tools in `vision_agent/tools/tools.py`, however, +`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide +the best performance. Those can be found in the same file under the `TOOLS` variable. + +If you can't find the tool you are looking for you can also add custom tools to the +agent: ```python import vision_agent as va @@ -217,9 +227,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our we add the source code for all the tools used in `VisionAgent`. ## Additional Backends +### Anthropic +`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an +Anthropic API key and set it in your environment variables: + +```bash +export ANTHROPIC_API_KEY="your-api-key" +``` + +Because Anthropic does not support embedding models, the default embedding model used +is the OpenAI model so you will also need to set your OpenAI API key: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.AnthropicVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + +### OpenAI +`OpenAIVisionAgentCoder` uses OpenAI. To get started you just need to get an OpenAI API +key and set it in your environment variables: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.OpenAIVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + + ### Ollama -We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download -a few models: +`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models: ```bash ollama pull llama3.1 @@ -240,9 +289,8 @@ tools. You can use it just like you would use `VisionAgentCoder`: > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B. ### Azure OpenAI -We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started -follow the Azure Setup section below. You can use it just like you would use= -`VisionAgentCoder`: +`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup +section below. 
You can use it just like you would use `VisionAgentCoder`: ```python >>> import vision_agent as va diff --git a/docs/index.md b/docs/index.md index 0f5022f9..a83e343e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -30,10 +30,11 @@ To get started, you can install the library using pip: pip install vision-agent ``` -Ensure you have an OpenAI API key and set it as an environment variable (if you are -using Azure OpenAI please see the Azure setup section): +Ensure you have an Anthropic key and an OpenAI API key and set in your environment +variables (if you are using Azure OpenAI please see the Azure setup section): ```bash +export ANTHROPIC_API_KEY="your-api-key" export OPENAI_API_KEY="your-api-key" ``` @@ -68,6 +69,9 @@ You can find more details about the streamlit app [here](examples/chat/). >>> resp = agent(resp) ``` +`VisionAgent` currently utilizes Claude-3.5 as it's default LMM and uses OpenAI for +embeddings for tool searching. + ### Vision Agent Coder #### Basic Usage You can interact with the agent as you would with any LLM or LMM model: @@ -129,7 +133,8 @@ of the input is a list of dictionaries with the keys `role`, `content`, and `med "code": "from vision_agent.tools import ..." "test": "calculate_filled_percentage('jar.jpg')", "test_result": "...", - "plan": [{"code": "...", "test": "...", "plan": "..."}, ...], + "plans": {"plan1": {"thoughts": "..."}, ...}, + "plan_thoughts": "...", "working_memory": ..., } ``` @@ -166,20 +171,25 @@ result = agent.chat_with_workflow(conv) ### Tools There are a variety of tools for the model or the user to use. Some are executed locally while others are hosted for you. You can easily access them yourself, for example if -you want to run `owl_v2` and visualize the output you can run: +you want to run `owl_v2_image` and visualize the output you can run: ```python import vision_agent.tools as T import matplotlib.pyplot as plt image = T.load_image("dogs.jpg") -dets = T.owl_v2("dogs", image) +dets = T.owl_v2_image("dogs", image) viz = T.overlay_bounding_boxes(image, dets) plt.imshow(viz) plt.show() ``` -You can also add custom tools to the agent: +You can find all available tools in `vision_agent/tools/tools.py`, however, +`VisionAgentCoder` only utilizes a subset of tools that have been tested and provide +the best performance. Those can be found in the same file under the `TOOLS` variable. + +If you can't find the tool you are looking for you can also add custom tools to the +agent: ```python import vision_agent as va @@ -214,9 +224,48 @@ Can't find the tool you need and want add it to `VisionAgent`? Check out our we add the source code for all the tools used in `VisionAgent`. ## Additional Backends +### Anthropic +`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an +Anthropic API key and set it in your environment variables: + +```bash +export ANTHROPIC_API_KEY="your-api-key" +``` + +Because Anthropic does not support embedding models, the default embedding model used +is the OpenAI model so you will also need to set your OpenAI API key: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.AnthropicVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + +### OpenAI +`OpenAIVisionAgentCoder` uses OpenAI. 
To get started you just need to get an OpenAI API +key and set it in your environment variables: + +```bash +export OPEN_AI_API_KEY="your-api-key" +``` + +Usage is the same as `VisionAgentCoder`: + +```python +>>> import vision_agent as va +>>> agent = va.agent.OpenAIVisionAgentCoder() +>>> agent("Count the apples in the image", media="apples.jpg") +``` + + ### Ollama -We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download -a few models: +`OllamaVisionAgentCoder` uses Ollama. To get started you must download a few models: ```bash ollama pull llama3.1 @@ -237,9 +286,8 @@ tools. You can use it just like you would use `VisionAgentCoder`: > WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B. ### Azure OpenAI -We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started -follow the Azure Setup section below. You can use it just like you would use= -`VisionAgentCoder`: +`AzureVisionAgentCoder` uses Azure OpenAI models. To get started follow the Azure Setup +section below. You can use it just like you would use `VisionAgentCoder`: ```python >>> import vision_agent as va diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py index ba5b989e..4954738c 100644 --- a/tests/integ/test_tools.py +++ b/tests/integ/test_tools.py @@ -21,8 +21,8 @@ grounding_dino, grounding_sam, ixc25_image_vqa, - ixc25_video_vqa, ixc25_temporal_localization, + ixc25_video_vqa, loca_visual_prompt_counting, loca_zero_shot_counting, ocr, @@ -33,6 +33,8 @@ vit_nsfw_classification, ) +FINE_TUNE_ID = "65ebba4a-88b7-419f-9046-0750e30250da" + def test_grounding_dino(): img = ski.data.coins() @@ -65,6 +67,18 @@ def test_owl_v2_image(): assert [res["label"] for res in result] == ["coin"] * len(result) +def test_owl_v2_fine_tune_id(): + img = ski.data.coins() + result = owl_v2_image( + prompt="coin", + image=img, + fine_tune_id=FINE_TUNE_ID, + ) + # this calls a fine-tuned florence2 model which is going to be worse at this task + assert 14 <= len(result) <= 26 + assert [res["label"] for res in result] == ["coin"] * len(result) + + def test_owl_v2_video(): frames = [ np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10) @@ -78,7 +92,7 @@ def test_owl_v2_video(): assert 24 <= len([res["label"] for res in result[0]]) <= 26 -def test_object_detection(): +def test_florence2_phrase_grounding(): img = ski.data.coins() result = florence2_phrase_grounding( image=img, @@ -88,6 +102,18 @@ def test_object_detection(): assert [res["label"] for res in result] == ["coin"] * 25 +def test_florence2_phrase_grounding_fine_tune_id(): + img = ski.data.coins() + result = florence2_phrase_grounding( + prompt="coin", + image=img, + fine_tune_id=FINE_TUNE_ID, + ) + # this calls a fine-tuned florence2 model which is going to be worse at this task + assert 14 <= len(result) <= 26 + assert [res["label"] for res in result] == ["coin"] * len(result) + + def test_template_match(): img = ski.data.coins() result = template_match( @@ -119,6 +145,19 @@ def test_florence2_sam2_image(): assert len([res["mask"] for res in result]) == 25 +def test_florence2_sam2_image_fine_tune_id(): + img = ski.data.coins() + result = florence2_sam2_image( + prompt="coin", + image=img, + fine_tune_id=FINE_TUNE_ID, + ) + # this calls a fine-tuned florence2 model which is going to be worse at this task + assert 14 <= len(result) <= 26 + assert [res["label"] for res in 
result] == ["coin"] * len(result) + assert len([res["mask"] for res in result]) == len(result) + + def test_florence2_sam2_video(): frames = [ np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py new file mode 100644 index 00000000..4db319f9 --- /dev/null +++ b/tests/unit/test_utils.py @@ -0,0 +1,45 @@ +from vision_agent.agent.agent_utils import extract_code, extract_json + + +def test_basic_json_extract(): + a = '{"a": 1, "b": 2}' + assert extract_json(a) == {"a": 1, "b": 2} + + +def test_side_case_quotes_json_extract(): + a = "{'0': 'no', '3': 'no', '6': 'no', '9': 'yes', '12': 'no', '15': 'no'}" + a_json = extract_json(a) + assert len(a_json) == 6 + + +def test_side_case_bool_json_extract(): + a = "{'0': False, '3': False, '6': False, '9': True, '12': False, '15': False}" + a_json = extract_json(a) + assert len(a_json) == 6 + + +def test_complicated_case_json_extract_1(): + a = """```json { "plan1": { "thoughts": "This plan uses the owl_v2_video tool to detect the truck and then uses ocr to read the USDOT and trailer numbers. This approach is efficient as it can process the entire video at once for truck detection.", "instructions": [ "Use extract_frames to get frames from truck1.mp4", "Use owl_v2_video with prompt 'truck' to detect if a truck is present in the video", "If a truck is detected, use ocr on relevant frames to read the USDOT and trailer numbers", "Process the OCR results to extract the USDOT and trailer numbers", "Compile results into JSON format and save using save_json" ] }, "plan2": { "thoughts": "This plan uses florence2_sam2_video_tracking to segment and track the truck, then uses florence2_ocr for text detection. This approach might be more accurate for text detection as it can focus on the relevant parts of the truck.", "instructions": [ "Use extract_frames to get frames from truck1.mp4", "Use florence2_sam2_video_tracking with prompt 'truck' to segment and track the truck", "If a truck is segmented, use florence2_ocr on the segmented area to detect text", "Process the OCR results to extract the USDOT and trailer numbers", "Compile results into JSON format and save using save_json" ] }, "plan3": { "thoughts": "This plan uses ixc25_video_vqa to directly ask questions about the truck, USDOT number, and trailer number. This approach leverages the model's ability to understand and answer complex questions about video content.", "instructions": [ "Use extract_frames to get frames from truck1.mp4", "Use ixc25_video_vqa with the question 'Is there a truck in this video?' to detect the presence of a truck", "If a truck is present, use ixc25_video_vqa with the question 'What is the USDOT number on the truck?'", "Use ixc25_video_vqa with the question 'What is the trailer number on the truck?'", "Process the answers to extract the required information", "Compile results into JSON format and save using save_json" ] } } ```""" + + a_json = extract_json(a) + assert len(a_json) == 3 + assert "plan1" in a_json + + +def test_complicated_case_json_extract_2(): + a = """{\n "predicted_answer": "2",\n "thoughts": "After analyzing the image and the tool outputs, I can see that there are indeed 2 dogs in the image. One is a small grey dog on the grass, and the other is a larger white dog on the patio. All three plans correctly identified 2 dogs, but I believe plan2 using the countgd_counting tool is the best choice for this task. Here\'s why:\n\n 1. 
Accuracy: The countgd_counting tool provided high confidence scores (0.92 and 0.9) for both dogs, which aligns with what I can see in the image.\n \n 2. Precision: The bounding boxes from the countgd_counting tool seem to be more precise and tightly fit around the dogs compared to the other tools.\n \n 3. Simplicity: While plan3 offers a more complex approach with additional verification, it\'s not necessary in this case as the dogs are clearly visible and easily identifiable. The extra steps in plan3 would add unnecessary complexity and potential for errors.\n \n 4. Efficiency: Plan2 is more straightforward and efficient than plan3, while potentially offering better accuracy than plan1 (owl_v2_image tool had lower confidence scores).",\n "best_plan": "plan2"\n}""" + a_json = extract_json(a) + assert len(a_json) == 3 + assert "predicted_answer" in a_json + + +def test_basic_code_extract(): + a = """```python +def test_basic_json_extract(): + a = '{"a": 1, "b": 2}' + assert extract_json(a) == {"a": 1, "b": 2} +``` +""" + a_code = extract_code(a) + assert "def test_basic_json_extract():" in a_code + assert "assert extract_json(a) == {" in a_code diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index e1478a38..793f44cf 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,8 +1,9 @@ from .agent import Agent from .vision_agent import VisionAgent from .vision_agent_coder import ( + AnthropicVisionAgentCoder, AzureVisionAgentCoder, - ClaudeVisionAgentCoder, OllamaVisionAgentCoder, + OpenAIVisionAgentCoder, VisionAgentCoder, ) diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py index 2a193a4a..dc0debee 100644 --- a/vision_agent/agent/agent_utils.py +++ b/vision_agent/agent/agent_utils.py @@ -40,12 +40,18 @@ def _strip_markdown_code(inp_str: str) -> str: def extract_json(json_str: str) -> Dict[str, Any]: - json_str = json_str.replace("\n", " ").strip() + json_str_mod = json_str.replace("\n", " ").strip() + json_str_mod = json_str_mod.replace("'", '"') + json_str_mod = json_str_mod.replace(": True", ": true").replace( + ": False", ": false" + ) try: - return json.loads(json_str) # type: ignore + return json.loads(json_str_mod) # type: ignore except json.JSONDecodeError: json_orig = json_str + # don't replace quotes here or booleans since it can also introduce errors + json_str = json_str.replace("\n", " ").strip() json_str = _strip_markdown_code(json_str) json_str = _find_markdown_json(json_str) json_dict = _extract_sub_json(json_str) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 62682524..c64390d5 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -3,18 +3,23 @@ import os import tempfile from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast, Callable +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast from vision_agent.agent import Agent from vision_agent.agent.agent_utils import extract_json from vision_agent.agent.vision_agent_prompts import ( EXAMPLES_CODE1, EXAMPLES_CODE2, + EXAMPLES_CODE3, VA_CODE, ) -from vision_agent.lmm import LMM, Message, OpenAILMM +from vision_agent.lmm import LMM, AnthropicLMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING -from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args +from vision_agent.tools.meta_tools import ( + Artifacts, + check_and_load_image, + 
use_extra_vision_agent_args, +) from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter, Execution @@ -30,7 +35,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, florence2_fine_tuning, use_florence2_fine_tuning", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -68,10 +73,18 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: prompt = VA_CODE.format( documentation=META_TOOL_DOCSTRING, - examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}", + examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}\n{EXAMPLES_CODE3}", conversation=conversation, ) - return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore + message: Message = {"role": "user", "content": prompt} + # only add recent media so we don't overload the model with old images + if ( + chat[-1]["role"] == "observation" + and "media" in chat[-1] + and len(chat[-1]["media"]) > 0 # type: ignore + ): + message["media"] = chat[-1]["media"] + return extract_json(orch([message], stream=False)) # type: ignore def run_code_action( @@ -136,10 +149,8 @@ def __init__( code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. 
""" - self.agent = ( - OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent - ) - self.max_iterations = 100 + self.agent = AnthropicLMM(temperature=0.0) if agent is None else agent + self.max_iterations = 12 self.verbosity = verbosity self.code_sandbox_runtime = code_sandbox_runtime self.callback_message = callback_message @@ -267,7 +278,8 @@ def chat_with_code( orig_chat.append({"role": "observation", "content": artifacts_loaded}) self.streaming_message({"role": "observation", "content": artifacts_loaded}) - if isinstance(last_user_message_content, str): + if int_chat[-1]["role"] == "user": + last_user_message_content = cast(str, int_chat[-1].get("content", "")) user_code_action = parse_execution(last_user_message_content, False) if user_code_action is not None: user_result, user_obs = run_code_action( @@ -309,8 +321,7 @@ def chat_with_code( else: self.streaming_message({"role": "assistant", "content": response}) - if response["let_user_respond"]: - break + finished = response["let_user_respond"] code_action = parse_execution( response["response"], test_multi_plan, customized_tool_names @@ -321,13 +332,22 @@ def chat_with_code( code_action, code_interpreter, str(remote_artifacts_path) ) + media_obs = check_and_load_image(code_action) + if self.verbosity >= 1: _LOGGER.info(obs) + + chat_elt: Message = {"role": "observation", "content": obs} + if media_obs and result.success: + chat_elt["media"] = [ + Path(code_interpreter.remote_path) / media_ob + for media_ob in media_obs + ] + # don't add execution results to internal chat - int_chat.append({"role": "observation", "content": obs}) - orig_chat.append( - {"role": "observation", "content": obs, "execution": result} - ) + int_chat.append(chat_elt) + chat_elt["execution"] = result + orig_chat.append(chat_elt) self.streaming_message( { "role": "observation", @@ -353,3 +373,63 @@ def streaming_message(self, message: Dict[str, Any]) -> None: def log_progress(self, data: Dict[str, Any]) -> None: pass + + +class OpenAIVisionAgent(VisionAgent): + def __init__( + self, + agent: Optional[LMM] = None, + verbosity: int = 0, + local_artifacts_path: Optional[Union[str, Path]] = None, + code_sandbox_runtime: Optional[str] = None, + callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> None: + """Initialize the VisionAgent using OpenAI LMMs. + + Parameters: + agent (Optional[LMM]): The agent to use for conversation and orchestration + of other agents. + verbosity (int): The verbosity level of the agent. + local_artifacts_path (Optional[Union[str, Path]]): The path to the local + artifacts file. + code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. + """ + + agent = OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent + super().__init__( + agent, + verbosity, + local_artifacts_path, + code_sandbox_runtime, + callback_message, + ) + + +class AnthropicVisionAgent(VisionAgent): + def __init__( + self, + agent: Optional[LMM] = None, + verbosity: int = 0, + local_artifacts_path: Optional[Union[str, Path]] = None, + code_sandbox_runtime: Optional[str] = None, + callback_message: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> None: + """Initialize the VisionAgent using Anthropic LMMs. + + Parameters: + agent (Optional[LMM]): The agent to use for conversation and orchestration + of other agents. + verbosity (int): The verbosity level of the agent. + local_artifacts_path (Optional[Union[str, Path]]): The path to the local + artifacts file. 
+ code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. + """ + + agent = AnthropicLMM(temperature=0.0) if agent is None else agent + super().__init__( + agent, + verbosity, + local_artifacts_path, + code_sandbox_runtime, + callback_message, + ) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index edf249ac..ec5ece0b 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -2,12 +2,10 @@ import logging import os import sys -import tempfile from json import JSONDecodeError from pathlib import Path from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast -from PIL import Image from rich.console import Console from rich.style import Style from rich.syntax import Syntax @@ -29,8 +27,8 @@ ) from vision_agent.lmm import ( LMM, + AnthropicLMM, AzureOpenAILMM, - ClaudeSonnetLMM, Message, OllamaLMM, OpenAILMM, @@ -53,6 +51,9 @@ class DefaultImports: """Container for default imports used in the code execution.""" common_imports = [ + "import os", + "import numpy as np", + "from vision_agent.tools import *", "from typing import *", "from pillow_heif import register_heif_opener", "register_heif_opener()", @@ -92,29 +93,6 @@ def format_plans(plans: Dict[str, Any]) -> str: return plan_str -def extract_image( - media: Optional[Sequence[Union[str, Path]]], -) -> Optional[Sequence[Union[str, Path]]]: - if media is None: - return None - - new_media = [] - for m in media: - m = Path(m) - extension = m.suffix - if extension in [".jpg", ".jpeg", ".png", ".bmp"]: - new_media.append(m) - elif extension in [".mp4", ".mov"]: - frames = T.extract_frames(m) - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - if len(frames) > 0: - Image.fromarray(frames[0][0]).save(tmp.name) - new_media.append(Path(tmp.name)) - if len(new_media) == 0: - return None - return new_media - - def write_plans( chat: List[Message], tool_desc: str, @@ -146,7 +124,7 @@ def pick_plan( log_progress: Callable[[Dict[str, Any]], None], verbosity: int = 0, max_retries: int = 3, -) -> Tuple[str, str]: +) -> Tuple[Dict[str, str], str]: log_progress( { "type": "log", @@ -199,7 +177,10 @@ def pick_plan( # retry if the tool output is empty or code fails count = 0 - while (not tool_output.success or tool_output_str == "") and count < max_retries: + while ( + not tool_output.success + or (len(tool_output.logs.stdout) == 0 and len(tool_output.logs.stderr) == 0) + ) and count < max_retries: prompt = TEST_PLANS.format( docstring=tool_info, plans=plan_str, @@ -238,6 +219,7 @@ def pick_plan( if verbosity == 2: _print_code("Code and test after attempted fix:", code) _LOGGER.info(f"Code execution result after attempt {count + 1}") + _LOGGER.info(f"{tool_output_str}") count += 1 @@ -256,10 +238,10 @@ def pick_plan( chat[-1]["content"] = prompt count = 0 - best_plan = None - while best_plan is None and count < max_retries: + plan_thoughts = None + while plan_thoughts is None and count < max_retries: try: - best_plan = extract_json(model(chat, stream=False)) # type: ignore + plan_thoughts = extract_json(model(chat, stream=False)) # type: ignore except JSONDecodeError as e: _LOGGER.exception( f"Error while extracting JSON during picking best plan {str(e)}" @@ -268,23 +250,27 @@ def pick_plan( count += 1 if ( - best_plan is None - or "best_plan" not in best_plan - or ("best_plan" in best_plan and best_plan["best_plan"] not in plans) + plan_thoughts is None + or "best_plan" not in plan_thoughts + or 
("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans) ): - best_plan = {"best_plan": list(plans.keys())[0]} + _LOGGER.info(f"Failed to pick best plan. Using the first plan. {plan_thoughts}") + plan_thoughts = {"best_plan": list(plans.keys())[0]} + + if "thoughts" not in plan_thoughts: + plan_thoughts["thoughts"] = "" if verbosity >= 1: - _LOGGER.info(f"Best plan:\n{best_plan}") + _LOGGER.info(f"Best plan:\n{plan_thoughts}") log_progress( { "type": "log", "log_content": "Picked best plan", "status": "completed", - "payload": plans[best_plan["best_plan"]], + "payload": plans[plan_thoughts["best_plan"]], } ) - return best_plan["best_plan"], tool_output_str + return plan_thoughts, "```python\n" + code + "\n```\n" + tool_output_str def write_code( @@ -292,6 +278,7 @@ def write_code( chat: List[Message], plan: str, tool_info: str, + plan_thoughts: str, tool_output: str, feedback: str, ) -> str: @@ -304,6 +291,7 @@ def write_code( docstring=tool_info, question=FULL_TASK.format(user_request=user_request, subtasks=plan), tool_output=tool_output, + plan_thoughts=plan_thoughts, feedback=feedback, ) chat[-1]["content"] = prompt @@ -339,6 +327,7 @@ def write_and_test_code( plan: str, tool_info: str, tool_output: str, + plan_thoughts: str, tool_utils: str, working_memory: List[Dict[str, str]], coder: LMM, @@ -363,6 +352,7 @@ def write_and_test_code( plan, tool_info, tool_output, + plan_thoughts, format_memory(working_memory), ) test = write_test( @@ -634,31 +624,30 @@ def __init__( """Initialize the Vision Agent Coder. Parameters: - planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM. - coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM. - tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM. - debugger (Optional[LMM]): The debugger model to + planner (Optional[LMM]): The planner model to use. Defaults to AnthropicLMM. + coder (Optional[LMM]): The coder model to use. Defaults to AnthropicLMM. + tester (Optional[LMM]): The tester model to use. Defaults to AnthropicLMM. + debugger (Optional[LMM]): The debugger model to use. Defaults to AnthropicLMM. tool_recommender (Optional[Sim]): The tool recommender model to use. verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the highest verbosity level which will output all intermediate debugging code. - report_progress_callback: a callback to report the progress of the agent. - This is useful for streaming logs in a web application where multiple - VisionAgentCoder instances are running in parallel. This callback - ensures that the progress are not mixed up. - code_sandbox_runtime: the code sandbox runtime to use. A code sandbox is - used to run the generated code. It can be one of the following - values: None, "local" or "e2b". If None, VisionAgentCoder will read - the value from the environment variable CODE_SANDBOX_RUNTIME. If it's - also None, the local python runtime environment will be used. + report_progress_callback (Optional[Callable[Dict[str, Any]]]): a callback + to report the progress of the agent. This is useful for streaming logs + in a web application where multiple VisionAgentCoder instances are + running in parallel. This callback ensures that the progress are not + mixed up. + code_sandbox_runtime (Optional[str]): the code sandbox runtime to use. A + code sandbox is used to run the generated code. It can be one of the + following values: None, "local" or "e2b". 
If None, VisionAgentCoder + will read the value from the environment variable CODE_SANDBOX_RUNTIME. + If it's also None, the local python runtime environment will be used. """ - self.planner = ( - OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner - ) - self.coder = OpenAILMM(temperature=0.0) if coder is None else coder - self.tester = OpenAILMM(temperature=0.0) if tester is None else tester - self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger + self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner + self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder + self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester + self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger self.verbosity = verbosity if self.verbosity > 0: _LOGGER.setLevel(logging.INFO) @@ -785,7 +774,7 @@ def chat_with_workflow( ) if test_multi_plan: - best_plan, tool_output_str = pick_plan( + plan_thoughts, tool_output_str = pick_plan( int_chat, plans, tool_infos["all"], @@ -795,9 +784,12 @@ def chat_with_workflow( self.log_progress, verbosity=self.verbosity, ) + best_plan = plan_thoughts["best_plan"] + plan_thoughts_str = plan_thoughts["thoughts"] else: best_plan = list(plans.keys())[0] tool_output_str = "" + plan_thoughts_str = "" if best_plan in plans and best_plan in tool_infos: plan_i = plans[best_plan] @@ -832,6 +824,7 @@ def chat_with_workflow( + "\n-".join([e for e in plan_i["instructions"]]), tool_info=tool_info, tool_output=tool_output_str, + plan_thoughts=plan_thoughts_str, tool_utils=T.UTILITIES_DOCSTRING, working_memory=working_memory, coder=self.coder, @@ -862,7 +855,8 @@ def chat_with_workflow( "code": DefaultImports.prepend_imports(code), "test": test, "test_result": execution_result, - "plan": plan, + "plans": plans, + "plan_thoughts": plan_thoughts_str, "working_memory": working_memory, } @@ -904,7 +898,9 @@ def _log_plans(self, plans: Dict[str, Any], verbosity: int) -> None: ) -class ClaudeVisionAgentCoder(VisionAgentCoder): +class OpenAIVisionAgentCoder(VisionAgentCoder): + """Initializes Vision Agent Coder using OpenAI models for planning, coding, testing.""" + def __init__( self, planner: Optional[LMM] = None, @@ -916,13 +912,44 @@ def __init__( report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, code_sandbox_runtime: Optional[str] = None, ) -> None: - # NOTE: Claude doesn't have an official JSON mode - self.planner = ClaudeSonnetLMM(temperature=0.0) if planner is None else planner - self.coder = ClaudeSonnetLMM(temperature=0.0) if coder is None else coder - self.tester = ClaudeSonnetLMM(temperature=0.0) if tester is None else tester - self.debugger = ( - ClaudeSonnetLMM(temperature=0.0) if debugger is None else debugger + self.planner = ( + OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner + ) + self.coder = OpenAILMM(temperature=0.0) if coder is None else coder + self.tester = OpenAILMM(temperature=0.0) if tester is None else tester + self.debugger = OpenAILMM(temperature=0.0) if debugger is None else debugger + self.verbosity = verbosity + if self.verbosity > 0: + _LOGGER.setLevel(logging.INFO) + + self.tool_recommender = ( + Sim(T.TOOLS_DF, sim_key="desc") + if tool_recommender is None + else tool_recommender ) + self.report_progress_callback = report_progress_callback + self.code_sandbox_runtime = code_sandbox_runtime + + +class AnthropicVisionAgentCoder(VisionAgentCoder): + """Initializes Vision Agent Coder using 
Anthropic models for planning, coding, testing.""" + + def __init__( + self, + planner: Optional[LMM] = None, + coder: Optional[LMM] = None, + tester: Optional[LMM] = None, + debugger: Optional[LMM] = None, + tool_recommender: Optional[Sim] = None, + verbosity: int = 0, + report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, + code_sandbox_runtime: Optional[str] = None, + ) -> None: + # NOTE: Claude doesn't have an official JSON mode + self.planner = AnthropicLMM(temperature=0.0) if planner is None else planner + self.coder = AnthropicLMM(temperature=0.0) if coder is None else coder + self.tester = AnthropicLMM(temperature=0.0) if tester is None else tester + self.debugger = AnthropicLMM(temperature=0.0) if debugger is None else debugger self.verbosity = verbosity if self.verbosity > 0: _LOGGER.setLevel(logging.INFO) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index 6d7b18d6..e117a2e1 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -30,9 +30,10 @@ **Instructions**: 1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request. -2. Output three different plans each utilize a different strategy or set of tools. +2. For each subtask, be sure to include the tool(s) you want to use to accomplish that subtask. +3. Output three different plans each utilize a different strategy or set of tools ordering them from most likely to least likely to succeed. -Output a list of jsons in the following format +Output a list of jsons in the following format: ```json {{ @@ -67,7 +68,7 @@ {previous_attempts} **Instructions**: -1. Write a program to load the media and call each tool and save it's output. +1. Write a program to load the media and call each tool and print it's output along with other relevant information. 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary. 3. Your test case MUST run only on the given images which are {media} 4. Print this final dictionary. @@ -102,24 +103,25 @@ --- EXAMPLE2 --- plan1: -- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool. -- Use the 'owl_v2_image' tool with the prompt 'person' to detect where the people are in the video. +- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool. +- Use the 'owl_v2_video' tool with the prompt 'person' to detect where the people are in the video. plan2: -- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool. +- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool. - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video. plan3: -- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool. +- Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames_and_timestamps' tool. - Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video. 
```python import numpy as np -from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking +from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking # sample at 1 FPS and use the first 10 frames to reduce processing time -frames = extract_frames("video.mp4", 1) -frames = [f[0] for f in frames][:10] +frames = extract_frames_and_timestamps("video.mp4", 1) +frames = [f["frame"] for f in frames][:10] +# strip arrays from the output to make it easier to read def remove_arrays(o): if isinstance(o, list): return [remove_arrays(e) for e in o] @@ -130,18 +132,46 @@ def remove_arrays(o): else: return o +# return the counts of each label per frame to help determine the stability of the model results +def get_counts(preds): + counts = {{}} + for i, pred_frame in enumerate(preds): + counts_i = {{}} + for pred in pred_frame: + label = pred["label"].split(":")[1] if ":" in pred["label"] else pred["label"] + counts_i[label] = counts_i.get(label, 0) + 1 + counts[f"frame_{{i}}"] = counts_i + return counts + + # plan1 -owl_v2_out = [owl_v2_image("person", f) for f in frames] +owl_v2_out = owl_v2_video("person", frames) +owl_v2_counts = get_counts(owl_v2_out) # plan2 florence2_out = [florence2_phrase_grounding("person", f) for f in frames] +florence2_counts = get_counts(florence2_out) # plan3 f2s2_tracking_out = florence2_sam2_video_tracking("person", frames) remove_arrays(f2s2_tracking_out) +f2s2_counts = get_counts(f2s2_tracking_out) + +final_out = {{ + "owl_v2_video": owl_v2_out, + "florence2_phrase_grounding": florence2_out, + "florence2_sam2_video_tracking": f2s2_out, +}} + +counts = {{ + "owl_v2_video": owl_v2_counts, + "florence2_phrase_grounding": florence2_counts, + "florence2_sam2_video_tracking": f2s2_counts, +}} -final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}} print(final_out) +print(labels_and_scores) +print(counts) ``` """ @@ -159,7 +189,7 @@ def remove_arrays(o): PICK_PLAN = """ -**Role**: You are a software programmer. +**Role**: You are an advanced AI model that can understand the user request and construct plans to accomplish it. **Task**: Your responsibility is to pick the best plan from the three plans provided. @@ -173,13 +203,14 @@ def remove_arrays(o): {tool_output} **Instructions**: -1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request. -2. Solve the problem yourself given the image and pick the plan that matches your solution the best. +1. Re-read the user request, plans, tool outputs and examine the image. +2. Solve the problem yourself given the image and pick the most accurate plan that matches your solution the best. +3. Add modifications to improve the plan including: changing a tool, adding thresholds, string matching. 3. 
Output a JSON object with the following format: {{ "predicted_answer": str # the answer you would expect from the best plan - "thoughts": str # your thought process for choosing the best plan - "best_plan": str # the best plan you have chosen + "thoughts": str # your thought process for choosing the best plan over other plans and any modifications you made + "best_plan": str # the best plan you have chosen, must be `plan1`, `plan2`, or `plan3` }} """ @@ -201,15 +232,18 @@ def remove_arrays(o): **User Instructions**: {question} -**Tool Output**: +**Tool Tests and Outputs**: {tool_output} +**Tool Output Thoughts**: +{plan_thoughts} + **Previous Feedback**: {feedback} **Instructions**: 1. **Understand and Clarify**: Make sure you understand the task. -2. **Algorithm/Method Selection**: Decide on the most efficient way. +2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs and tool thoughts to guide you. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255). """ diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 7b0dd600..80623016 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -18,19 +18,24 @@ {examples} --- END EXAMPLES --- -**Instructions**: -1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. -2. **Output in JSON**: Respond in JSON format, {{"thoughts": , "response": , "let_user_respond": }}. - **Conversation**: Here is the current conversation so far: --- START CONVERSATION --- {conversation} +--- END CONVERSATION --- + +**Instructions**: +1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. +2. **Output in JSON**: Respond in the following format in JSON: + +```json +{{"thoughts": , "response": , "let_user_respond": }}. +``` """ EXAMPLES_CODE1 = """ -USER: Can you detect the dogs in this image? Media name dog.jpg +USER: Can you write code to detect the dogs in this image? Media name dog.jpg OBSERVATION: [Artifacts loaded] @@ -61,6 +66,7 @@ EXAMPLES_CODE1_EXTRA = """ USER: The the image only has one dog, can you fix this? +OBSERVATION: [Artifacts loaded] Artifact dog.jpg loaded to /path/to/images/dog.jpg Artifact dog_detector.py loaded to /path/to/code/dog_detector.py @@ -86,8 +92,24 @@ AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} """ - EXAMPLES_CODE2 = """ +USER: Can you describe this image? 
+ +OBSERVATION: +[Artifacts loaded] +Artifact image.jpg loaded to /path/to/images/image.jpg +[End of artifacts] + +AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "view_media_artifacts('image.jpg')", "let_user_respond": false} + +OBSERVATION: +[Image image.jpg displayed] + +AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true} +""" + + +EXAMPLES_CODE3 = """ USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes? OBSERVATION: @@ -137,13 +159,13 @@ USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" -AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")", "let_user_respond": false} +AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false} OBSERVATION: -[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] +[Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] -AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} +AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. 
use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} OBSERVATION: [Artifact code.py edits] diff --git a/vision_agent/lmm/__init__.py b/vision_agent/lmm/__init__.py index 61ee1794..c75881c1 100644 --- a/vision_agent/lmm/__init__.py +++ b/vision_agent/lmm/__init__.py @@ -1,2 +1,2 @@ -from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM +from .lmm import LMM, AnthropicLMM, AzureOpenAILMM, OllamaLMM, OpenAILMM from .types import Message diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index d075dad5..0362fb3e 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -1,5 +1,4 @@ import json -import logging import os from abc import ABC, abstractmethod from pathlib import Path @@ -14,8 +13,6 @@ from .types import Message -_LOGGER = logging.getLogger(__name__) - class LMM(ABC): @abstractmethod @@ -45,11 +42,11 @@ def __call__( class OpenAILMM(LMM): - r"""An LMM class for the OpenAI GPT-4 Vision model.""" + r"""An LMM class for the OpenAI LMMs.""" def __init__( self, - model_name: str = "gpt-4o", + model_name: str = "gpt-4o-2024-05-13", api_key: Optional[str] = None, max_tokens: int = 4096, json_mode: bool = False, @@ -365,8 +362,8 @@ def f() -> Iterator[Optional[str]]: return resp["response"] # type: ignore -class ClaudeSonnetLMM(LMM): - r"""An LMM class for Anthropic's Claude Sonnet model.""" +class AnthropicLMM(LMM): + r"""An LMM class for Anthropic's LMMs.""" def __init__( self, @@ -402,7 +399,7 @@ def chat( ] if "media" in msg: for media_path in msg["media"]: - encoded_media = encode_media(media_path) + encoded_media = encode_media(media_path, resize=768) content.append( ImageBlockParam( type="image", @@ -449,7 +446,7 @@ def generate( ] if media: for m in media: - encoded_media = encode_media(m) + encoded_media = encode_media(m, resize=768) content.append( ImageBlockParam( type="image", diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index a401fb46..22453224 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -21,7 +21,7 @@ depth_anything_v2, detr_segmentation, dpt_hybrid_midas, - extract_frames, + extract_frames_and_timestamps, florence2_image_caption, florence2_ocr, florence2_phrase_grounding, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 0d20cb28..52d732f7 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -486,6 +486,33 @@ def list_artifacts(artifacts: Artifacts) -> str: return output_str +def check_and_load_image(code: str) -> List[str]: + if not code.strip(): + return [] + + pattern = r"show_media_artifact\(\s*([^\)]+),\s*['\"]([^\)]+)['\"]\s*\)" + match = re.search(pattern, code) + if match: + name = match.group(2) + return [name] + return [] + + +def view_media_artifact(artifacts: Artifacts, name: str) -> str: + """Views the image artifact with the given name. + + Parameters: + artifacts (Artifacts): The artifacts object to show the image from. + name (str): The name of the image artifact to show. + """ + if name not in artifacts: + output_str = f"[Artifact {name} does not exist]" + else: + output_str = f"[Image {name} displayed]" + print(output_str) + return output_str + + def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. 
Helpful for answering questions about what types of vision tasks you can do with @@ -493,16 +520,15 @@ def get_tool_descriptions() -> str: return TOOL_DESCRIPTIONS -def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: +def object_detection_fine_tuning(bboxes: List[Dict[str, Any]]) -> str: """DO NOT use this function unless the user has supplied you with bboxes. - 'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect - objects in an image based on a given dataset. It returns the fine tuning job id. + 'object_detection_fine_tuning' is a tool that fine-tunes object detection models to + be able to detect objects in an image based on a given dataset. It returns the fine + tuning job id. Parameters: bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels and bounding boxes. The coordinates are unnormalized. - task (str): The florencev2 fine-tuning task. The options are - 'phrase_grounding'. Returns: str: The fine tuning job id, this id will used to retrieve the fine tuned @@ -510,12 +536,13 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: Example ------- - >>> fine_tuning_job_id = florencev2_fine_tuning( + >>> fine_tuning_job_id = object_detection_fine_tuning( [{'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[370, 30, 560, 290]]}, {'image_path': 'filename.png', 'labels': ['screw'], 'bboxes': [[120, 0, 300, 170]]}], "phrase_grounding" ) """ + task = "phrase_grounding" bboxes_input = [BboxInput.model_validate(bbox) for bbox in bboxes] task_type = PromptTask[task.upper()] fine_tuning_request = [ @@ -531,7 +558,7 @@ def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: fine_tune_id = str( landing_api.launch_fine_tuning_job("florencev2", task_type, fine_tuning_request) ) - print(f"[Florence2 fine tuning id: {fine_tune_id}]") + print(f"[Fine tuning id: {fine_tune_id}]") return fine_tune_id @@ -564,7 +591,7 @@ def use_extra_vision_agent_args( Returns: str: The edited code. """ - generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)" + generate_pattern = r"generate_vision_code\(\s*([^\)]+)\s*\)" def generate_replacer(match: re.Match) -> str: arg = match.group(1) @@ -575,7 +602,7 @@ def generate_replacer(match: re.Match) -> str: out_str += ")" return out_str - edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)" + edit_pattern = r"edit_vision_code\(\s*([^\)]+)\s*\)" def edit_replacer(match: re.Match) -> str: arg = match.group(1) @@ -591,48 +618,52 @@ def edit_replacer(match: re.Match) -> str: return new_code -def use_florence2_fine_tuning( - artifacts: Artifacts, name: str, task: str, fine_tune_id: str +def use_object_detection_fine_tuning( + artifacts: Artifacts, name: str, fine_tune_id: str ) -> str: - """Replaces florence2 calls with the fine tuning id. This ensures that the code - utilizes the fined tuned florence2 model. Returns the diff between the original - code and the new code. + """Replaces calls to 'owl_v2_image', 'florence2_phrase_detection' and + 'florence2_sam2_image' with the fine tuning id. This ensures that the code utilizes + the fined tuned florence2 model. Returns the diff between the original code and the + new code. Parameters: artifacts (Artifacts): The artifacts object to edit the code from. name (str): The name of the artifact to edit. - task (str): The task to fine tune the model for. The options are - 'phrase_grounding'. fine_tune_id (str): The fine tuning job id. 
Examples -------- - >>> diff = use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf") + >>> diff = use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf") """ - task_to_fn = {"phrase_grounding": "florence2_phrase_grounding"} - if name not in artifacts: output_str = f"[Artifact {name} does not exist]" print(output_str) return output_str code = artifacts[name] - if task.lower() == "phrase_grounding": - pattern = r"florence2_phrase_grounding\(\s*([^\)]+)\)" - - def replacer(match: re.Match) -> str: - arg = match.group(1) # capture all initial arguments - return f'florence2_phrase_grounding({arg}, "{fine_tune_id}")' - - else: - raise ValueError(f"Task {task} is not supported.") + patterns = [ + ( + r"florence2_phrase_grounding\(\s*([^\)]+)\s*\)", + lambda match: f'florence2_phrase_grounding({match.group(1)}, "{fine_tune_id}")', + ), + ( + r"owl_v2_image\(\s*([^\)]+)\s*\)", + lambda match: f'owl_v2_image({match.group(1)}, "{fine_tune_id}")', + ), + ( + r"florence2_sam2_image\(\s*([^\)]+)\s*\)", + lambda match: f'florence2_sam2_image({match.group(1)}, "{fine_tune_id}")', + ), + ] - new_code = re.sub(pattern, replacer, code) + new_code = code + for pattern, replacer in patterns: + new_code = re.sub(pattern, replacer, new_code) if new_code == code: output_str = ( - f"[Fine tuning task {task} function {task_to_fn[task]} not found in code]" + f"[No function calls to replace with fine tuning id in artifact {name}]" ) print(output_str) return output_str @@ -662,8 +693,9 @@ def replacer(match: re.Match) -> str: generate_vision_code, edit_vision_code, write_media_artifact, - florence2_fine_tuning, - use_florence2_fine_tuning, + view_media_artifact, + object_detection_fine_tuning, + use_object_detection_fine_tuning, list_artifacts, ] ) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 63927f01..fca3819c 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -149,6 +149,7 @@ def owl_v2_image( prompt: str, image: np.ndarray, box_threshold: float = 0.10, + fine_tune_id: Optional[str] = None, ) -> List[Dict[str, Any]]: """'owl_v2_image' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions on images. The categories in @@ -160,6 +161,8 @@ def owl_v2_image( image (np.ndarray): The image to ground the prompt to. box_threshold (float, optional): The threshold for the box detection. Defaults to 0.10. + fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the + fine-tuned model ID here to use it. 
     Returns:
         List[Dict[str, Any]]: A list of dictionaries containing the score, label, and
@@ -176,7 +179,38 @@ def owl_v2_image(
         {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5},
     ]
     """
+    image_size = image.shape[:2]
+
+    if fine_tune_id is not None:
+        image_b64 = convert_to_b64(image)
+        landing_api = LandingPublicAPI()
+        status = landing_api.check_fine_tuning_job(UUID(fine_tune_id))
+        if status is not JobStatus.SUCCEEDED:
+            raise FineTuneModelIsNotReady(
+                f"Fine-tuned model {fine_tune_id} is not ready yet"
+            )
+
+        data_obj = Florence2FtRequest(
+            image=image_b64,
+            task=PromptTask.PHRASE_GROUNDING,
+            tool="florencev2_fine_tuning",
+            prompt=prompt,
+            fine_tuning=FineTuning(job_id=UUID(fine_tune_id)),
+        )
+        data = data_obj.model_dump(by_alias=True)
+        detections = send_inference_request(data, "tools", v2=False)
+        detections = detections[""]
+        bboxes_formatted = [
+            ODResponseData(
+                label=detections["labels"][i],
+                bbox=normalize_bbox(detections["bboxes"][i], image_size),
+                score=1.0,
+            )
+            for i in range(len(detections["bboxes"]))
+        ]
+        return [bbox.model_dump() for bbox in bboxes_formatted]
+
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     payload = {
@@ -206,10 +240,10 @@ def owl_v2_video(
     box_threshold: float = 0.10,
 ) -> List[List[Dict[str, Any]]]:
     """'owl_v2_video' will run owl_v2 on each frame of a video. It can detect multiple
-    objects per frame given a text prompt sucha s a category name or referring
-    expression. The categories in text prompt are separated by commas. It returns a list
-    of lists where each inner list contains the score, label, and bounding box of the
-    detections for that frame.
+    objects independently per frame given a text prompt such as a category name or
+    referring expression but does not track objects across frames. The categories in the
+    text prompt are separated by commas. It returns a list of lists where each inner
+    list contains the score, label, and bounding box of the detections for that frame.
 
     Parameters:
         prompt (str): The prompt to ground to the video.
@@ -335,7 +369,9 @@ def grounding_sam(
     return return_data
 
 
-def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]:
+def florence2_sam2_image(
+    prompt: str, image: np.ndarray, fine_tune_id: Optional[str] = None
+) -> List[Dict[str, Any]]:
     """'florence2_sam2_image' is a tool that can segment multiple objects given a text
     prompt such as category names or referring expressions. The categories in the text
     prompt are separated by commas. It returns a list of bounding boxes, label names,
@@ -344,6 +380,8 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]]
     Parameters:
         prompt (str): The prompt to ground to the image.
         image (np.ndarray): The image to ground the prompt to.
+        fine_tune_id (Optional[str]): If you have a fine-tuned model, you can pass the
+            fine-tuned model ID here to use it.
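A usage sketch for the new `fine_tune_id` argument on `owl_v2_image`; the image path and job id are placeholders, and the exception's import location is an assumption.

```python
import vision_agent.tools as T
from vision_agent.tools.tools_types import FineTuneModelIsNotReady  # assumed location

image = T.load_image("screws.jpg")  # placeholder image

try:
    # With fine_tune_id set, inference is routed to the fine-tuned model and every
    # detection comes back with a score of 1.0; without it, the default OWLv2
    # endpoint is used.
    dets = T.owl_v2_image("screw", image, fine_tune_id="23b3b022-5ebf-4798-9373-20ef36429abf")
except FineTuneModelIsNotReady:
    # The job has not reached SUCCEEDED yet, so fall back to the base model.
    dets = T.owl_v2_image("screw", image)
```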
Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, @@ -369,18 +407,52 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] }, ] """ - buffer_bytes = numpy_to_bytes(image) + if fine_tune_id is not None: + image_b64 = convert_to_b64(image) + landing_api = LandingPublicAPI() + status = landing_api.check_fine_tuning_job(UUID(fine_tune_id)) + if status is not JobStatus.SUCCEEDED: + raise FineTuneModelIsNotReady( + f"Fine-tuned model {fine_tune_id} is not ready yet" + ) + req_data_obj = Florence2FtRequest( + image=image_b64, + task=PromptTask.PHRASE_GROUNDING, + tool="florencev2_fine_tuning", + prompt=prompt, + fine_tuning=FineTuning( + job_id=UUID(fine_tune_id), + postprocessing="sam2", + ), + ) + req_data = req_data_obj.model_dump(by_alias=True) + detections_ft = send_inference_request(req_data, "tools", v2=False) + detections_ft = detections_ft[""] + return_data = [] + all_masks = np.array(detections_ft["masks"]) + for i in range(len(detections_ft["bboxes"])): + return_data.append( + { + "score": 1.0, + "label": detections_ft["labels"][i], + "bbox": detections_ft["bboxes"][i], + "mask": all_masks[i, :, :].astype(np.uint8), + } + ) + return return_data + + buffer_bytes = numpy_to_bytes(image) files = [("image", buffer_bytes)] payload = { "prompts": [s.strip() for s in prompt.split(",")], "function_name": "florence2_sam2_image", } - data: Dict[str, Any] = send_inference_request( + detections: Dict[str, Any] = send_inference_request( payload, "florence2-sam2", files=files, v2=True ) return_data = [] - for _, data_i in data["0"].items(): + for _, data_i in detections["0"].items(): mask = rle_decode_array(data_i["mask"]) label = data_i["label"] bbox = normalize_bbox(data_i["bounding_box"], data_i["mask"]["size"]) @@ -389,17 +461,19 @@ def florence2_sam2_image(prompt: str, image: np.ndarray) -> List[Dict[str, Any]] def florence2_sam2_video_tracking( - prompt: str, frames: List[np.ndarray] + prompt: str, frames: List[np.ndarray], chunk_length: Optional[int] = None ) -> List[List[Dict[str, Any]]]: """'florence2_sam2_video_tracking' is a tool that can segment and track multiple entities in a video given a text prompt such as category names or referring expressions. You can optionally separate the categories in the text with commas. It - only tracks entities present in the first frame and only returns segmentation - masks. It is useful for tracking and counting without duplicating counts. + can find new objects every 'chunk_length' frames and is useful for tracking and + counting without duplicating counts and always outputs scores of 1.0. Parameters: prompt (str): The prompt to ground to the video. frames (List[np.ndarray]): The list of frames to ground the prompt to. + chunk_length (Optional[int]): The number of frames to re-run florence2 to find + new objects. 
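A sketch of the new `chunk_length` parameter on `florence2_sam2_video_tracking`; the frames here are synthetic stand-ins and the value of 30 is arbitrary.

```python
import numpy as np
import vision_agent.tools as T

# Synthetic stand-in frames; in practice these would come from a real video.
frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(90)]

# With chunk_length=30 the Florence-2 detector is re-run every 30 frames, so objects
# that first appear later in the video are still picked up and tracked.
tracks = T.florence2_sam2_video_tracking("box, forklift", frames, chunk_length=30)

# tracks[i] holds the detections for frame i; scores are always 1.0.
print(len(tracks), "frames,", len(tracks[0]), "detections in the first frame")
```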
Returns: List[List[Dict[str, Any]]]: A list of list of dictionaries containing the label @@ -432,6 +506,8 @@ def florence2_sam2_video_tracking( "prompts": [s.strip() for s in prompt.split(",")], "function_name": "florence2_sam2_video_tracking", } + if chunk_length is not None: + payload["chunk_length"] = chunk_length # type: ignore data: Dict[str, Any] = send_inference_request( payload, "florence2-sam2", files=files, v2=True ) @@ -1119,13 +1195,13 @@ def florence2_phrase_grounding( return_data = [] for i in range(len(detections["bboxes"])): return_data.append( - { - "score": 1.0, - "label": detections["labels"][i], - "bbox": normalize_bbox(detections["bboxes"][i], image_size), - } + ODResponseData( + label=detections["labels"][i], + bbox=normalize_bbox(detections["bboxes"][i], image_size), + score=1.0, + ) ) - return return_data + return [bbox.model_dump() for bbox in return_data] def florence2_ocr(image: np.ndarray) -> List[Dict[str, Any]]: @@ -1497,12 +1573,14 @@ def closest_box_distance( # Utility and visualization functions -def extract_frames( +def extract_frames_and_timestamps( video_uri: Union[str, Path], fps: float = 1 -) -> List[Tuple[np.ndarray, float]]: - """'extract_frames' extracts frames from a video which can be a file path, url or - youtube link, returns a list of tuples (frame, timestamp), where timestamp is the - relative time in seconds where the frame was captured. The frame is a numpy array. +) -> List[Dict[str, Union[np.ndarray, float]]]: + """'extract_frames_and_timestamps' extracts frames and timestamps from a video + which can be a file path, url or youtube link, returns a list of dictionaries + with keys "frame" and "timestamp" where "frame" is a numpy array and "timestamp" is + the relative time in seconds where the frame was captured. The frame is a numpy + array. Parameters: video_uri (Union[str, Path]): The path to the video file, url or youtube link @@ -1510,15 +1588,23 @@ def extract_frames( to 1. Returns: - List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame - as a numpy array and the timestamp in seconds. + List[Dict[str, Union[np.ndarray, float]]]: A list of dictionaries containing the + extracted frame as a numpy array and the timestamp in seconds. Example ------- >>> extract_frames("path/to/video.mp4") - [(frame1, 0.0), (frame2, 0.5), ...] + [{"frame": np.ndarray, "timestamp": 0.0}, ...] 
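Since the renamed helper returns dictionaries rather than `(frame, timestamp)` tuples, downstream code unpacks by key. A short sketch with a placeholder video path:

```python
import vision_agent.tools as T

frames_and_ts = T.extract_frames_and_timestamps("warehouse.mp4", fps=1)
frames = [d["frame"] for d in frames_and_ts]           # one np.ndarray per sampled frame
timestamps = [d["timestamp"] for d in frames_and_ts]   # seconds from the start of the video
print(f"{len(frames)} frames, first at {timestamps[0]:.1f}s")
```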
""" + def reformat( + frames_and_timestamps: List[Tuple[np.ndarray, float]] + ) -> List[Dict[str, Union[np.ndarray, float]]]: + return [ + {"frame": frame, "timestamp": timestamp} + for frame, timestamp in frames_and_timestamps + ] + if str(video_uri).startswith( ( "http://www.youtube.com/", @@ -1540,16 +1626,16 @@ def extract_frames( raise Exception("No suitable video stream found") video_file_path = video.download(output_path=temp_dir) - return extract_frames_from_video(video_file_path, fps) + return reformat(extract_frames_from_video(video_file_path, fps)) elif str(video_uri).startswith(("http", "https")): _, image_suffix = os.path.splitext(video_uri) with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file: # Download the video and save it to the temporary file with urllib.request.urlopen(str(video_uri)) as response: tmp_file.write(response.read()) - return extract_frames_from_video(tmp_file.name, fps) + return reformat(extract_frames_from_video(tmp_file.name, fps)) - return extract_frames_from_video(str(video_uri), fps) + return reformat(extract_frames_from_video(str(video_uri), fps)) def save_json(data: Any, file_path: str) -> None: @@ -1953,7 +2039,6 @@ def overlay_counting_results( vit_image_classification, vit_nsfw_classification, countgd_counting, - florence2_image_caption, florence2_ocr, florence2_sam2_image, florence2_sam2_video_tracking, @@ -1968,7 +2053,7 @@ def overlay_counting_results( ] UTIL_TOOLS = [ - extract_frames, + extract_frames_and_timestamps, save_json, load_image, save_image, diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index 6ebcf468..aa0e430f 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -28,6 +28,7 @@ class FineTuning(BaseModel): model_config = ConfigDict(populate_by_name=True) job_id: UUID = Field(alias="jobId") + postprocessing: Optional[str] = None @field_serializer("job_id") def serialize_job_id(self, job_id: UUID, _info: SerializationInfo) -> str: diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index e480dac5..0536fefd 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -5,7 +5,7 @@ from importlib import resources from io import BytesIO from pathlib import Path -from typing import Dict, List, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np from PIL import Image, ImageDraw, ImageFont @@ -154,15 +154,20 @@ def convert_to_b64(data: Union[str, Path, np.ndarray, ImageType]) -> str: ) -def encode_image_bytes(image: bytes) -> str: - image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore +def encode_image_bytes(image: bytes, resize: Optional[int] = None) -> str: + if resize is not None: + image_pil = Image.open(io.BytesIO(image)).convert("RGB") + if image_pil.size[0] > resize or image_pil.size[1] > resize: + image_pil.thumbnail((resize, resize)) + else: + image_pil = Image.open(io.BytesIO(image)).convert("RGB") buffer = io.BytesIO() - image.save(buffer, format="PNG") # type: ignore + image_pil.save(buffer, format="PNG") encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8") return encoded_image -def encode_media(media: Union[str, Path]) -> str: +def encode_media(media: Union[str, Path], resize: Optional[int] = None) -> str: if isinstance(media, str) and media.startswith(("http", "https")): # for mp4 video url, we assume there is a same url but ends with png # vision-agent-ui will upload this png when uploading the video @@ 
-192,11 +197,17 @@ def encode_media(media: Union[str, Path]) -> str: frames = extract_frames_from_video(str(media), fps=1) image = frames[len(frames) // 2] buffer = io.BytesIO() - Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG") + if resize is not None: + image_pil = Image.fromarray(image[0]).convert("RGB") + if image_pil.size[0] > resize or image_pil.size[1] > resize: + image_pil.thumbnail((resize, resize)) + else: + image_pil = Image.fromarray(image[0]).convert("RGB") + image_pil.save(buffer, format="PNG") image_bytes = buffer.getvalue() else: image_bytes = open(media, "rb").read() - return encode_image_bytes(image_bytes) + return encode_image_bytes(image_bytes, resize=resize) def denormalize_bbox( diff --git a/vision_agent/utils/video.py b/vision_agent/utils/video.py index ba6b0c76..0bb6fb18 100644 --- a/vision_agent/utils/video.py +++ b/vision_agent/utils/video.py @@ -61,6 +61,7 @@ def video_writer( stream.height = height - (height % 2) stream.width = width - (width % 2) stream.pix_fmt = "yuv420p" + stream.options = {"crf": "10"} for frame in frames: # Remove the alpha channel (convert RGBA to RGB) frame_rgb = frame[:, :, :3] @@ -77,7 +78,7 @@ def video_writer( def frames_to_bytes( - frames: List[np.ndarray], fps: float = 10, file_ext: str = ".mp4" + frames: List[np.ndarray], fps: float = 1.0, file_ext: str = ".mp4" ) -> bytes: r"""Convert a list of frames to a video file encoded into a byte string.
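The `resize` argument added to the encoding helpers only ever downscales (PIL's `thumbnail` preserves aspect ratio and never enlarges), so callers can cap payload size without special-casing small images. A minimal sketch, assuming these helpers are imported from `vision_agent.utils.image_utils` and using a placeholder path:

```python
from vision_agent.utils.image_utils import encode_media

# Cap the longer side at 768 px before base64-encoding as PNG; images already
# smaller than the cap are passed through unchanged. Path and cap are placeholders.
b64_capped = encode_media("large_photo.jpg", resize=768)
b64_original = encode_media("large_photo.jpg")  # resize=None keeps the prior behavior
```

On the video side, the `crf: 10` writer option keeps the intermediate H.264 output at very high quality (lower CRF means better quality, at the cost of larger files), and the 1.0 FPS default in `frames_to_bytes` lines up with the 1 FPS default used by `extract_frames_and_timestamps`.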