diff --git a/README.md b/README.md
index e34e265e..3812767a 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 <div align="center">
-    <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
+    <img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">
 
 # 🔍🤖 Vision Agent
 [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
@@ -345,6 +345,11 @@ result = agent.generate_code(conv)
 
 
 ## Additional Backends
+### E2B Code Execution
+If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
+set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
+run all the agent generated code on the E2B backend.
+
 ### Anthropic
 `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
 Anthropic API key and set it in your environment variables:
diff --git a/assets/logo.jpg b/assets/logo.jpg
deleted file mode 100644
index a2fa3a2c..00000000
Binary files a/assets/logo.jpg and /dev/null differ
diff --git a/assets/logo.png b/assets/logo.png
new file mode 100644
index 00000000..636574f2
Binary files /dev/null and b/assets/logo.png differ
diff --git a/docs/index.md b/docs/index.md
index 08c808a9..297f70ac 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -341,6 +341,11 @@ result = agent.generate_code(conv)
 
 
 ## Additional Backends
+### E2B Code Execution
+If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
+set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
+run all the agent generated code on the E2B backend.
+
 ### Anthropic
 `AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
 Anthropic API key and set it in your environment variables:
diff --git a/tests/integ/test_tools.py b/tests/integ/test_tools.py
index 690795f0..42c8e62e 100644
--- a/tests/integ/test_tools.py
+++ b/tests/integ/test_tools.py
@@ -24,7 +24,6 @@
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     loca_visual_prompt_counting,
     loca_zero_shot_counting,
@@ -71,6 +70,14 @@ def test_owl_v2_image():
     assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])
 
 
+def test_owl_v2_image_empty():
+    result = owl_v2_image(
+        prompt="coin",
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
+
+
 def test_owl_v2_fine_tune_id():
     img = ski.data.coins()
     result = owl_v2_image(
@@ -110,6 +117,14 @@ def test_florence2_phrase_grounding():
     assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])
 
 
+def test_florence2_phrase_grounding_empty():
+    result = florence2_phrase_grounding(
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+        prompt="coin",
+    )
+    assert result == []
+
+
 def test_florence2_phrase_grounding_fine_tune_id():
     img = ski.data.coins()
     result = florence2_phrase_grounding(
@@ -147,7 +162,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id():
         fine_tune_id=FINE_TUNE_ID,
     )
     assert len(result) == 10
-    assert 16 <= len([res["label"] for res in result[0]]) <= 26
+    assert 12 <= len([res["label"] for res in result[0]]) <= 26
     assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result[0]])
 
 
@@ -195,6 +210,14 @@ def test_florence2_sam2_image_fine_tune_id():
     assert len([res["mask"] for res in result]) == len(result)
 
 
+def test_florence2_sam2_image_empty():
+    result = florence2_sam2_image(
+        prompt="coin",
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
+
+
 def test_florence2_sam2_video():
     frames = [
         np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
@@ -208,7 +231,7 @@ def test_florence2_sam2_video():
     assert len([res["mask"] for res in result[0]]) == 25
 
 
-def test_segmentation():
+def test_detr_segmentation():
     img = ski.data.coins()
     result = detr_segmentation(
         image=img,
@@ -218,6 +241,13 @@ def test_segmentation():
     assert len([res["mask"] for res in result]) == 1
 
 
+def test_detr_segmentation_empty():
+    result = detr_segmentation(
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
+
+
 def test_clip():
     img = ski.data.coins()
     result = clip(
@@ -227,6 +257,15 @@ def test_clip():
     assert result["scores"] == [0.9999, 0.0001]
 
 
+def test_clip_empty():
+    result = clip(
+        classes=["coins", "notes"],
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result["scores"] == []
+    assert result["labels"] == []
+
+
 def test_vit_classification():
     img = ski.data.coins()
     result = vit_image_classification(
@@ -235,6 +274,14 @@ def test_vit_classification():
     assert "typewriter keyboard" in result["labels"]
 
 
+def test_vit_classification_empty():
+    result = vit_image_classification(
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result["labels"] == []
+    assert result["scores"] == []
+
+
 def test_nsfw_classification():
     img = ski.data.coins()
     result = vit_nsfw_classification(
@@ -243,7 +290,7 @@ def test_nsfw_classification():
     assert result["label"] == "normal"
 
 
-def test_image_caption() -> None:
+def test_image_caption():
     img = ski.data.rocket()
     result = blip_image_caption(
         image=img,
@@ -251,7 +298,7 @@ def test_image_caption() -> None:
     assert result.strip() == "a rocket on a stand"
 
 
-def test_florence_image_caption() -> None:
+def test_florence_image_caption():
     img = ski.data.rocket()
     result = florence2_image_caption(
         image=img,
@@ -259,7 +306,7 @@ def test_florence_image_caption() -> None:
     assert "The image shows a rocket on a launch pad at night" in result.strip()
 
 
-def test_loca_zero_shot_counting() -> None:
+def test_loca_zero_shot_counting():
     img = ski.data.coins()
 
     result = loca_zero_shot_counting(
@@ -268,7 +315,7 @@ def test_loca_zero_shot_counting() -> None:
     assert result["count"] == 21
 
 
-def test_loca_visual_prompt_counting() -> None:
+def test_loca_visual_prompt_counting():
     img = ski.data.coins()
     result = loca_visual_prompt_counting(
         visual_prompt={"bbox": [85, 106, 122, 145]},
@@ -277,7 +324,7 @@ def test_loca_visual_prompt_counting() -> None:
     assert result["count"] == 25
 
 
-def test_git_vqa_v2() -> None:
+def test_git_vqa_v2():
     img = ski.data.rocket()
     result = git_vqa_v2(
         prompt="Is the scene captured during day or night ?",
@@ -286,7 +333,7 @@ def test_git_vqa_v2() -> None:
     assert result.strip() == "night"
 
 
-def test_image_qa_with_context() -> None:
+def test_image_qa_with_context():
     img = ski.data.rocket()
     result = florence2_roberta_vqa(
         prompt="Is the scene captured during day or night ?",
@@ -295,7 +342,7 @@ def test_image_qa_with_context() -> None:
     assert "night" in result.strip()
 
 
-def test_ixc25_image_vqa() -> None:
+def test_ixc25_image_vqa():
     img = ski.data.cat()
     result = ixc25_image_vqa(
         prompt="What animal is in this image?",
@@ -304,7 +351,7 @@ def test_ixc25_image_vqa() -> None:
     assert "cat" in result.strip()
 
 
-def test_ixc25_video_vqa() -> None:
+def test_ixc25_video_vqa():
     frames = [
         np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
     ]
@@ -315,18 +362,7 @@ def test_ixc25_video_vqa() -> None:
     assert "cat" in result.strip()
 
 
-def test_ixc25_temporal_localization() -> None:
-    frames = [
-        np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
-    ]
-    result = ixc25_temporal_localization(
-        prompt="What animal is in this video?",
-        frames=frames,
-    )
-    assert result == [True] * 10
-
-
-def test_ocr() -> None:
+def test_ocr():
     img = ski.data.page()
     result = ocr(
         image=img,
@@ -334,7 +370,14 @@ def test_ocr() -> None:
     assert any("Region-based segmentation" in res["label"] for res in result)
 
 
-def test_florence2_ocr() -> None:
+def test_ocr_empty():
+    result = ocr(
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
+
+
+def test_florence2_ocr():
     img = ski.data.page()
     result = florence2_ocr(
         image=img,
@@ -342,6 +385,13 @@ def test_florence2_ocr() -> None:
     assert any("Region-based segmentation" in res["label"] for res in result)
 
 
+def test_florence2_ocr_empty():
+    result = florence2_ocr(
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
+
+
 def test_mask_distance():
     # Create two binary masks
     mask1 = np.zeros((100, 100), dtype=np.uint8)
@@ -399,14 +449,22 @@ def test_generate_hed():
     assert result.shape == img.shape
 
 
-def test_countgd_counting() -> None:
+def test_countgd_counting():
     img = ski.data.coins()
     result = countgd_counting(image=img, prompt="coin")
     assert len(result) == 24
     assert [res["label"] for res in result] == ["coin"] * 24
 
 
-def test_countgd_example_based_counting() -> None:
+def test_countgd_counting_empty():
+    result = countgd_counting(
+        prompt="coin",
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
+
+
+def test_countgd_example_based_counting():
     img = ski.data.coins()
     result = countgd_example_based_counting(
         visual_prompts=[[85, 106, 122, 145]],
@@ -414,3 +472,11 @@ def test_countgd_example_based_counting() -> None:
     )
     assert len(result) == 24
     assert [res["label"] for res in result] == ["object"] * 24
+
+
+def test_countgd_example_based_counting_empty():
+    result = countgd_example_based_counting(
+        visual_prompts=[[85, 106, 122, 145]],
+        image=np.zeros((0, 0, 3)).astype(np.uint8),
+    )
+    assert result == []
diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py
index 3fe619e8..e7a6e7c5 100644
--- a/tests/unit/test_va.py
+++ b/tests/unit/test_va.py
@@ -1,4 +1,12 @@
-from vision_agent.agent.vision_agent import parse_execution
+from vision_agent.agent.agent_utils import extract_tag
+from vision_agent.tools.meta_tools import use_extra_vision_agent_args
+
+
+def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
+    code = extract_tag(code, "execute_python")
+    if not code:
+        return None
+    return use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
 
 
 def test_parse_execution_zero():
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 42541d33..29643ecd 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,4 +1,5 @@
 import copy
+import json
 import logging
 import os
 import tempfile
@@ -6,7 +7,7 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_json
+from vision_agent.agent.agent_utils import extract_json, extract_tag
 from vision_agent.agent.vision_agent_prompts import (
     EXAMPLES_CODE1,
     EXAMPLES_CODE2,
@@ -18,6 +19,7 @@
     META_TOOL_DOCSTRING,
     Artifacts,
     check_and_load_image,
+    extract_and_save_files_to_artifacts,
     use_extra_vision_agent_args,
 )
 from vision_agent.utils import CodeInterpreterFactory
@@ -35,7 +37,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -57,6 +59,32 @@ def add_boilerplate(code: str, **format: Any) -> str:
         )
 
 
+def format_agent_message(agent_message: str) -> str:
+    agent_message_json = extract_json(agent_message)
+    output = ""
+    if "thinking" in agent_message_json and agent_message_json["thinking"]:
+        output += "<thinking>" + agent_message_json["thinking"] + "</thinking>"
+    if "response" in agent_message_json and agent_message_json["response"]:
+        output += "<response>" + agent_message_json["response"] + "</response>"
+    if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
+        output += (
+            "\n<execute_python>\n"
+            + agent_message_json["execute_python"]
+            + "\n</execute_python>\n"
+        )
+    if (
+        "let_user_respond" in agent_message_json
+        and agent_message_json["let_user_respond"]
+    ):
+        output += (
+            "<let_user_respond>"
+            + str(agent_message_json["let_user_respond"])
+            + "</let_user_respond>"
+        )
+
+    return output
+
+
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     chat = copy.deepcopy(chat)
 
@@ -67,7 +95,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         elif chat_i["role"] == "observation":
             conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
         elif chat_i["role"] == "assistant":
-            conversation += f"AGENT: {chat_i['content']}\n\n"
+            conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n"  # type: ignore
         else:
             raise ValueError(f"role {chat_i['role']} is not supported")
 
@@ -84,15 +112,31 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         and len(chat[-1]["media"]) > 0  # type: ignore
     ):
         message["media"] = chat[-1]["media"]
-    return extract_json(orch([message], stream=False))  # type: ignore
+    conv_resp = cast(str, orch([message], stream=False))
+
+    let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
+    let_user_respond = (
+        "true" in let_user_respond_str.lower() if let_user_respond_str else False
+    )
+
+    return {
+        "thinking": extract_tag(conv_resp, "thinking"),
+        "response": extract_tag(conv_resp, "response"),
+        "execute_python": extract_tag(conv_resp, "execute_python"),
+        "let_user_respond": let_user_respond,
+    }
 
 
 def execute_code_action(
-    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
+    artifacts: Artifacts,
+    code: str,
+    code_interpreter: CodeInterpreter,
+    artifact_remote_path: str,
 ) -> Tuple[Execution, str]:
     result = code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
+    extract_and_save_files_to_artifacts(artifacts, code)
 
     obs = str(result.logs)
     if result.error:
@@ -100,33 +144,8 @@ def execute_code_action(
     return result, obs
 
 
-def parse_execution(
-    response: str,
-    test_multi_plan: bool = True,
-    custom_tool_names: Optional[List[str]] = None,
-) -> Optional[str]:
-    code = None
-    remaining = response
-    all_code = []
-    while "<execute_python>" in remaining:
-        code_i = remaining[
-            remaining.find("<execute_python>") + len("<execute_python>") :
-        ]
-        code_i = code_i[: code_i.find("</execute_python>")]
-        remaining = remaining[
-            remaining.find("</execute_python>") + len("</execute_python>") :
-        ]
-        all_code.append(code_i)
-
-    if len(all_code) > 0:
-        code = "\n".join(all_code)
-
-    if code is not None:
-        code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
-    return code
-
-
 def execute_user_code_action(
+    artifacts: Artifacts,
     last_user_message: Message,
     code_interpreter: CodeInterpreter,
     artifact_remote_path: str,
@@ -138,40 +157,43 @@ def execute_user_code_action(
         return user_result, user_obs
 
     last_user_content = cast(str, last_user_message.get("content", ""))
+    try:
+        user_code_action = json.loads(last_user_content).get("execute_python", None)
+    except (json.JSONDecodeError, AttributeError):  # non-JSON or non-object JSON
+        return user_result, user_obs
 
-    user_code_action = parse_execution(last_user_content, False)
     if user_code_action is not None:
+        user_code_action = use_extra_vision_agent_args(user_code_action, False)
         user_result, user_obs = execute_code_action(
-            user_code_action, code_interpreter, artifact_remote_path
+            artifacts, user_code_action, code_interpreter, artifact_remote_path
         )
         if user_result.error:
             user_obs += f"\n{user_result.error}"
     return user_result, user_obs
 
 
-def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
+def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
     response = copy.deepcopy(response)
-    if "response" in response:
-        resp_str = response["response"]
-        if "<execute_python>" in resp_str:
-            # only include descriptions for these, the rest will just have executing
-            # code
-            description_map = {
-                "open_code_artifact": "Reading file.",
-                "create_code_artifact": "Creating file.",
-                "edit_code_artifact": "Editing file.",
-                "generate_vision_code": "Generating vision code.",
-                "edit_vision_code": "Editing vision code.",
-            }
-            description = ""
-            for k, v in description_map.items():
-                if k in resp_str:
-                    description += v + " "
-            if description == "":
-                description = "Executing code."
-            resp_str = resp_str[resp_str.find("<execute_python>") :]
-            resp_str = description + resp_str
-        response["response"] = resp_str
+
+    if "execute_python" in response and response["execute_python"]:
+        # only include descriptions for these, the rest will just have executing
+        # code
+        description_map = {
+            "open_code_artifact": "Reading file.",
+            "create_code_artifact": "Creating file.",
+            "edit_code_artifact": "Editing file.",
+            "generate_vision_code": "Generating vision code.",
+            "edit_vision_code": "Editing vision code.",
+        }
+        description = ""
+        for k, v in description_map.items():
+            if k in response["execute_python"]:
+                description += v + " "
+        if description == "":
+            description = "Executing code."
+
+        response["response"] = description
+
     return response
 
 
@@ -369,7 +391,10 @@ def chat_with_artifacts(
             self.streaming_message({"role": "observation", "content": artifacts_loaded})
 
             user_result, user_obs = execute_user_code_action(
-                last_user_message, code_interpreter, str(remote_artifacts_path)
+                artifacts,
+                last_user_message,
+                code_interpreter,
+                str(remote_artifacts_path),
             )
             finished = user_result is not None and user_obs is not None
             if user_result is not None and user_obs is not None:
@@ -394,13 +419,13 @@ def chat_with_artifacts(
                 int_chat.append(
                     {
                         "role": "assistant",
-                        "content": str(add_step_descriptions(response)),
+                        "content": json.dumps(add_step_descriptions(response)),
                     }
                 )
                 orig_chat.append(
                     {
                         "role": "assistant",
-                        "content": str(add_step_descriptions(response)),
+                        "content": json.dumps(add_step_descriptions(response)),
                     }
                 )
 
@@ -408,11 +433,13 @@ def chat_with_artifacts(
                 if last_response == response:
                     response["let_user_respond"] = True
 
-                finished = response["let_user_respond"]
+                finished = response.get("let_user_respond", False)
 
-                code_action = parse_execution(
-                    response["response"], test_multi_plan, custom_tool_names
-                )
+                code_action = response.get("execute_python", None)
+                if code_action is not None:
+                    code_action = use_extra_vision_agent_args(
+                        code_action, test_multi_plan, custom_tool_names
+                    )
 
                 if last_response == response:
                     self.streaming_message(
@@ -431,14 +458,17 @@ def chat_with_artifacts(
                     self.streaming_message(
                         {
                             "role": "assistant",
-                            "content": response,
+                            "content": json.dumps(response),
                             "finished": finished and code_action is None,
                         }
                     )
 
                 if code_action is not None:
                     result, obs = execute_code_action(
-                        code_action, code_interpreter, str(remote_artifacts_path)
+                        artifacts,
+                        code_action,
+                        code_interpreter,
+                        str(remote_artifacts_path),
                     )
 
                     media_obs = check_and_load_image(code_action)
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index a8b1e543..278b0b4c 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -27,13 +27,14 @@
 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
 2. **Code Generation**: Only use code provided in the Documentation in your <execute_python> tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
-3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question or show your results to the user, set <let_user_respond> to `true`.
 4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
-5. **Output in JSON**: Respond in the following format in JSON:
+5. **Output**: You can only respond with <thinking>, <response>, <execute_python>, and <let_user_respond> tags.
 
-```json
-{{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
-```
+<thinking>Your thoughts here...</thinking>
+<response>Your response to the user here...</response>
+<execute_python>Your code here...</execute_python>
+<let_user_respond>true/false for whether or not to you want to let the user respond.</let_user_respond>
 """
 
 
@@ -45,7 +46,11 @@
 Artifact dog.jpg loaded to /path/to/images/dog.jpg
 [End of artifacts]
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I will use the generate_vision_code to detect the dogs in the image.</thinking>
+<execute_python>
+generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 [Artifact dog_detector.py]
@@ -56,14 +61,21 @@
 4|    return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have generated the code to detect the dogs in the image, I must now run the code and print the results to get the output.</thinking>
+<execute_python>
+from dog_detector import detect_dogs
+print(detect_dogs('/example/workspace/dog.jpg'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]
 
 
-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detectd two dogs, do the results look good to you?", "let_user_respond": true}
+AGENT: <thinking>Two dogs are detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>The code detectd two dogs, do the results look good to you?</response>
+<let_user_respond>true</let_user_respond>
 """
 
 EXAMPLES_CODE1_EXTRA = """
@@ -75,7 +87,11 @@
 Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
 [End of artifacts]
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that</thinking>
+<execute_python>
+edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 [Artifact dog_detector.py]
@@ -86,13 +102,20 @@
 4|    return dogs
 [End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code and print the results to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have edited the code to detect only one dog, I must now run the code and print the results to get the output.</thinking>
+<execute_python>
+from dog_detector import detect_dogs
+print(detect_dogs('/path/to/images/dog.jpg'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
 
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
+AGENT: <thinking>One dog is detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>The code detected one dog, do these results look good to you?</response>
+<let_user_respond>true</let_user_respond>
 """
 
 EXAMPLES_CODE2 = """
@@ -103,12 +126,18 @@
 Artifact image.jpg loaded to /path/to/images/image.jpg
 [End of artifacts]
 
-AGENT: {"thoughts": "The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.", "response": "<execute_python>view_media_artifacts('image.jpg')</execute_python>", "let_user_respond": false}
+AGENT: <thinking>The user hasn't asked me to write any code and the task is very simple so I will view the image and answer myself to respond to the user quickly.</thinking>
+<execute_python>
+view_media_artifacts('image.jpg')
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 [Image image.jpg displayed]
 
-AGENT: {"thoughts": "The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.", "response": "The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?", "let_user_respond": true}
+AGENT: <thinking>The image shows a cat and a dog sitting on the couch, I will tell the user and ask them if they need any other assistance.</thinking>
+<response>The image contains a dog and a cat sitting on a couch. Can I help you with any other tasks?</response>
+<let_user_respond>true</let_user_respond>
 """
 
 
@@ -119,7 +148,9 @@
 [Artifacts loaded]
 [End of artifacts]
 
-AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+AGENT: <thinking>The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.</thinking>
+<response>Can you provide an image of workers with helmets?</response>
+<let_user_respond>true</let_user_respond>
 
 USER: Yes you can use workers.png
 
@@ -128,7 +159,11 @@
 Artifact workers.png loaded to /path/to/images/workers.png
 [End of artifacts]
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I will use the generate_vision_code to count the workers with helmets in the image.</thinking>
+<execute_python>
+generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 [Artifact code.py]
@@ -152,22 +187,37 @@
 16|    return count
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have generated the code to count the workers with helmets in the image, I must now run the code and print the output.</thinking>
+<execute_python>
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: <thinking>Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.</response>
+<let_user_respond>true</let_user_respond>
 
 USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
 
-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "<execute_python>object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])</execute_python>", "let_user_respond": false}
+AGENT: <thinking>Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model</thinking>
+<execute_python>
+object_detection_fine_tuning([{'image_path': 'image1.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png', 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
 
-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "<execute_python>use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+AGENT: <thinking>The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.</thinking>
+<execute_python>
+use_object_detection_fine_tuning(artifacts, "code.py", "23b3b022-5ebf-4798-9373-20ef36429abf")
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 [Artifact code.py edits]
@@ -184,11 +234,18 @@ def count_workers_with_helmets(image_path: str, output_path: str):
      count = 0
 [End of edits]
 
-AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png', 'workers_viz.png')</execute_python>", "let_user_respond": false}
+AGENT: <thinking>I have updated the code to use the fine tuned model, I must now run the code and print the output.</thinking>
+<execute_python>
+from code import count_workers_with_helmets
+print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))
+</execute_python>
+<let_user_respond>false</let_user_respond>
 
 OBSERVATION:
 ----- stdout -----
 3
 
-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: <thinking>Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.</thinking>
+<response>I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.</response>
+<let_user_respond>true</let_user_respond>
 """
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 2a75aa2b..16df1193 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -37,7 +37,6 @@
     grounding_dino,
     grounding_sam,
     ixc25_image_vqa,
-    ixc25_temporal_localization,
     ixc25_video_vqa,
     load_image,
     loca_visual_prompt_counting,
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 7f59c685..d9537e7c 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,4 +1,3 @@
-import base64
 import difflib
 import json
 import os
@@ -9,7 +8,6 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-import numpy as np
 from IPython.display import display
 from redbaron import RedBaron  # type: ignore
 
@@ -22,8 +20,7 @@
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
 from vision_agent.tools.tools_types import BboxInput, BboxInputBase64, PromptTask
 from vision_agent.utils.execute import Execution, MimeType
-from vision_agent.utils.image_utils import convert_to_b64, numpy_to_bytes
-from vision_agent.utils.video import frames_to_bytes
+from vision_agent.utils.image_utils import convert_to_b64
 
 CURRENT_FILE = None
 CURRENT_LINE = 0
@@ -393,19 +390,6 @@ def generate_vision_plan(
         redisplay_results(response.test_results)
     response.test_results = None
     artifacts[name] = response.model_dump_json()
-    media_names = extract_json(
-        AnthropicLMM()(  # type: ignore
-            f"""Extract any media file names from this output in the following JSON format:
-{{"media": ["image1.jpg", "image2.jpg"]}}
-
-{artifacts[name]}"""
-        )
-    )
-    if "media" in media_names and isinstance(media_names, dict):
-        for media in media_names["media"]:
-            if isinstance(media, str):
-                with open(media, "rb") as f:
-                    artifacts[media] = f.read()
 
     output_str = f"[Start Plan Context, saved at {name}]"
     for plan in response.plans.keys():
@@ -466,6 +450,12 @@ def detect_dogs(image_path: str):
         test_multi_plan=test_multi_plan,
         custom_tool_names=custom_tool_names,
     )
+
+    # capture and save any files that were saved in the code to the artifacts
+    extract_and_save_files_to_artifacts(
+        artifacts, response["code"] + "\n" + response["test"]
+    )
+
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -546,6 +536,11 @@ def detect_dogs(image_path: str):
         test_multi_plan=False,
         custom_tool_names=custom_tool_names,
     )
+    # capture and save any files that were saved in the code to the artifacts
+    extract_and_save_files_to_artifacts(
+        artifacts, response["code"] + "\n" + response["test"]
+    )
+
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -567,49 +562,6 @@ def detect_dogs(image_path: str):
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 
-def write_media_artifact(
-    artifacts: Artifacts,
-    name: str,
-    media: Union[str, np.ndarray, List[np.ndarray]],
-    fps: Optional[float] = None,
-) -> str:
-    """Writes a media file to the artifacts object.
-
-    Parameters:
-        artifacts (Artifacts): The artifacts object to save the media to.
-        name (str): The name of the media artifact to save.
-        media (Union[str, np.ndarray, List[np.ndarray]]): The media to save, can either
-            be a file path, single image or list of frames for a video.
-        fps (Optional[float]): The frames per second if you are writing a video.
-    """
-    if isinstance(media, str):
-        with open(media, "rb") as f:
-            media_bytes = f.read()
-    elif isinstance(media, list):
-        media_bytes = frames_to_bytes(media, fps=fps if fps is not None else 1.0)
-    elif isinstance(media, np.ndarray):
-        media_bytes = numpy_to_bytes(media)
-    else:
-        print(f"[Invalid media type {type(media)}]")
-        return f"[Invalid media type {type(media)}]"
-    artifacts[name] = media_bytes
-    print(f"[Media {name} saved]")
-    display(
-        {
-            MimeType.APPLICATION_ARTIFACT: json.dumps(
-                {
-                    "name": name,
-                    "action": "create",
-                    "content": base64.b64encode(media_bytes).decode("utf-8"),
-                    "contentType": "media_output",
-                }
-            )
-        },
-        raw=True,
-    )
-    return f"[Media {name} saved]"
-
-
 def list_artifacts(artifacts: Artifacts) -> str:
     """Lists all the artifacts that have been loaded into the artifacts object."""
     output_str = artifacts.show()
@@ -813,6 +765,61 @@ def use_object_detection_fine_tuning(
     return diff
 
 
+def extract_and_save_files_to_artifacts(artifacts: Artifacts, code: str) -> None:
+    """Extracts and saves files written by the code to the artifacts object.
+
+    AnthropicLMM is asked which file paths the code saves. Every returned path that
+    exists as a file is stored in artifacts under its file name, so two paths with
+    the same file name overwrite each other. Files whose extension is a known text
+    extension are read as str, all other files are read as bytes.
+
+    If extract_json raises json.JSONDecodeError the function returns without saving.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the files to.
+        code (str): The code to extract the files from.
+    """
+    try:
+        response = extract_json(
+            AnthropicLMM()(  # type: ignore
+                f"""You are a helpful AI assistant. Your job is to look at a snippet of code and return the file paths that are being saved in the file. Below is the code snippet:
+
+```python
+{code}
+```
+
+Return the file paths in the following JSON format:
+{{"file_paths": ["/path/to/image1.jpg", "/other/path/to/data.json"]}}"""
+            )
+        )
+    except json.JSONDecodeError:
+        # best effort: a reply that is not valid JSON means nothing gets saved
+        return
+
+    # every entry needs its leading dot to match Path.suffix, and a comma after it:
+    # adjacent string literals are silently concatenated into a single entry
+    # fmt: off
+    text_file_ext = {
+        ".txt", ".md", ".rtf", ".html", ".htm", ".xml",
+        ".json", ".csv", ".tsv", ".yaml", ".yml", ".toml",
+        ".conf", ".env", ".ini", ".log",
+        ".py", ".java", ".js", ".cpp", ".c", ".sql", ".sh",
+    }
+    # fmt: on
+
+    if "file_paths" in response and isinstance(response["file_paths"], list):
+        for file_path in response["file_paths"]:
+            # the list comes from an LLM, ignore anything that is not a path string
+            if not isinstance(file_path, str):
+                continue
+            path = Path(file_path)
+            if not path.is_file():
+                continue
+            read_mode = "r" if path.suffix in text_file_ext else "rb"
+            with open(path, read_mode) as f:
+                artifacts[path.name] = f.read()
+
+
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
@@ -822,7 +829,6 @@ def use_object_detection_fine_tuning(
         generate_vision_plan,
         generate_vision_code,
         edit_vision_code,
-        write_media_artifact,
         view_media_artifact,
         object_detection_fine_tuning,
         use_object_detection_fine_tuning,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 45f10c33..86e484b2 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -181,6 +181,8 @@ def owl_v2_image(
     """
 
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
 
     if fine_tune_id is not None:
         image_b64 = convert_to_b64(image)
@@ -413,6 +415,9 @@ def florence2_sam2_image(
             },
         ]
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return []
+
     if fine_tune_id is not None:
         image_b64 = convert_to_b64(image)
         landing_api = LandingPublicAPI()
@@ -701,6 +706,8 @@ def countgd_counting(
         ]
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     prompt = prompt.replace(", ", " .")
@@ -759,6 +766,8 @@ def countgd_example_based_counting(
         ]
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
     visual_prompts = [
@@ -828,6 +837,8 @@ def ixc25_image_vqa(prompt: str, image: np.ndarray) -> str:
         >>> ixc25_image_vqa('What is the cat doing?', image)
         'drinking milk'
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
 
     buffer_bytes = numpy_to_bytes(image)
     files = [("image", buffer_bytes)]
@@ -871,47 +882,6 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str:
     return cast(str, data["answer"])
 
 
-def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]:
-    """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video
-    given a prompt that can be other an object or a phrase. It returns a list of
-    boolean values indicating whether the object or phrase is present in the
-    corresponding frame.
-
-    Parameters:
-        prompt (str): The question about the video
-        frames (List[np.ndarray]): The reference frames used for the question
-
-    Returns:
-        List[bool]: A list of boolean values indicating whether the object or phrase is
-            present in the corresponding frame.
-
-    Example
-    -------
-        >>> output = ixc25_temporal_localization('soccer goal', frames)
-        >>> print(output)
-        [False, False, False, True, True, True, False, False, False, False]
-        >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4')
-    """
-
-    buffer_bytes = frames_to_bytes(frames)
-    files = [("video", buffer_bytes)]
-    payload = {
-        "prompt": prompt,
-        "chunk_length": 2,
-        "function_name": "ixc25_temporal_localization",
-    }
-    data: List[int] = send_inference_request(
-        payload,
-        "video-temporal-localization?model=internlm-xcomposer",
-        files=files,
-        v2=True,
-    )
-    chunk_size = round(len(frames) / len(data))
-    data_explode = [[elt] * chunk_size for elt in data]
-    data_bool = [bool(elt) for sublist in data_explode for elt in sublist]
-    return data_bool[: len(frames)]
-
-
 def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str:
     """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images
     including regular images or images of documents or presentations. It returns text
@@ -1024,6 +994,9 @@ def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]:
         {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
 
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return {"labels": [], "scores": []}
+
     image_b64 = convert_to_b64(image)
     data = {
         "prompt": ",".join(classes),
@@ -1052,6 +1025,8 @@ def vit_image_classification(image: np.ndarray) -> Dict[str, Any]:
         >>> vit_image_classification(image)
         {"labels": ["leopard", "lemur, otter", "bird"], "scores": [0.68, 0.30, 0.02]},
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return {"labels": [], "scores": []}
 
     image_b64 = convert_to_b64(image)
     data = {
@@ -1080,6 +1055,8 @@ def vit_nsfw_classification(image: np.ndarray) -> Dict[str, Any]:
         >>> vit_nsfw_classification(image)
         {"label": "normal", "scores": 0.68},
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
 
     image_b64 = convert_to_b64(image)
     data = {
@@ -1180,6 +1157,8 @@ def florence2_phrase_grounding(
         ]
     """
     image_size = image.shape[:2]
+    if image_size[0] < 1 or image_size[1] < 1:
+        return []
     image_b64 = convert_to_b64(image)
 
     if fine_tune_id is not None:
@@ -1399,6 +1378,8 @@ def detr_segmentation(image: np.ndarray) -> List[Dict[str, Any]]:
             },
         ]
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        return []
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,
@@ -1442,6 +1423,9 @@ def depth_anything_v2(image: np.ndarray) -> np.ndarray:
                 [10, 11, 15, ..., 202, 202, 205],
                 [10, 10, 10, ..., 200, 200, 200]], dtype=uint8),
     """
+    if image.shape[0] < 1 or image.shape[1] < 1:
+        raise ValueError(f"Image is empty, image shape: {image.shape}")
+
     image_b64 = convert_to_b64(image)
     data = {
         "image": image_b64,