From 6b8e58bfb89b536991b63949974da4416b53dc6f Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 15 Aug 2024 10:52:36 -0700
Subject: [PATCH 01/37] update for new conv

---
 vision_agent/agent/vision_agent.py         | 25 ++++++++++++++++++----
 vision_agent/agent/vision_agent_prompts.py |  2 ++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index cfb482e1..dad2d824 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,6 +1,8 @@
 import copy
 import logging
 import os
+import tempfile
+import pickle as pkl
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union, cast
 
@@ -12,12 +14,14 @@
     VA_CODE,
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.tools import META_TOOL_DOCSTRING
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter
 
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
+ARTIFACT = "artifacts.pkl"
 WORKSPACE = Path(os.getenv("WORKSPACE", ""))
 WORKSPACE.mkdir(parents=True, exist_ok=True)
 if str(WORKSPACE) != "":
@@ -28,7 +32,8 @@ class DefaultImports:
     code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions",
+        "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions",
+        f"artifacts = Artifacts({ARTIFACT!r})",  # repr() quotes the path; unquoted it is parsed as the expression `artifacts.pkl`
     ]
 
     @staticmethod
@@ -66,9 +71,21 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
 
 
-def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str:
-    # Note the code interpreter needs to keep running in the same environment because
-    # the SWE tools hold state like line numbers and currently open files.
+def run_code_action(code: str, artifacts: Artifacts, code_interpreter: CodeInterpreter) -> str:
+    """Upload each artifact and the pickled artifact store to the interpreter before executing `code`."""
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        for name in artifacts:
+            temp_file_path = Path(tmpdirname) / f"{name}.py"  # `Path / name + ".py"` is (Path / name) + str -> TypeError
+            temp_file_path.write_text(artifacts[name])
+            code_interpreter.upload_file(temp_file_path)
+            temp_file_path.unlink()
+
+        # Ship the raw artifact dict as a pickle; presumably read back via `Artifacts.load` in the sandbox - confirm.
+        temp_file_path = Path(tmpdirname) / ARTIFACT
+        temp_file_path.write_bytes(pkl.dumps(artifacts.artifacts))
+        code_interpreter.upload_file(temp_file_path)
+        temp_file_path.unlink()
+
     result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
 
     return_str = ""
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 4774d84d..7b714378 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -31,6 +31,7 @@
 {conversation}
 """
 
+
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
@@ -76,6 +77,7 @@
 AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
 """
 
+
 EXAMPLES_CODE2 = """
 USER: Can you create a function to count workers with helmets?
 

From 07a9b849b6c5454e038a1bc9e40ab894bd18e432 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 15 Aug 2024 10:52:50 -0700
Subject: [PATCH 02/37] add artifact tools

---
 vision_agent/tools/meta_tools.py | 755 +++++++++++++++++++------------
 1 file changed, 466 insertions(+), 289 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 4a82436d..29d56c55 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -1,12 +1,18 @@
 import os
+import pickle as pkl
 import subprocess
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Union
+from typing import Any, Dict, List, Optional, Union
+from uuid import UUID
 
 import vision_agent as va
+from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.utils.image_utils import convert_to_b64
+from vision_agent.utils import CodeInterpreterFactory
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -35,97 +41,32 @@ def filter_file(file_name: Union[str, Path]) -> bool:
     )
 
 
-def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str:
-    """Generates python code to solve vision based tasks.
+class Artifacts:
+    def __init__(self, save_path: Union[str, Path]) -> None:
+        self.save_path = save_path
+        self.artifacts = {}
 
-    Parameters:
-        save_file (str): The file path to save the code.
-        chat (str): The chat message from the user.
-        media (List[str]): The media files to use.
-
-    Returns:
-        str: The generated code.
-
-    Examples
-    --------
-        >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"])
-        from vision_agent.tools import load_image, owl_v2
-        def detect_dogs(image_path: str):
-            image = load_image(image_path)
-            dogs = owl_v2("dog", image)
-            return dogs
-    """
+        self.code_sandbox_runtime = None
 
-    if ZMQ_PORT is not None:
-        agent = va.agent.VisionAgentCoder(
-            report_progress_callback=lambda inp: report_progress_callback(
-                int(ZMQ_PORT), inp
-            )
-        )
-    else:
-        agent = va.agent.VisionAgentCoder()
-    try:
-        fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-        response = agent.chat_with_workflow(fixed_chat)
-        code = response["code"]
-        with open(save_file, "w") as f:
-            f.write(code)
-        code_lines = code.splitlines(keepends=True)
-        total_lines = len(code_lines)
-        return view_lines(code_lines, 0, total_lines, save_file, total_lines)
-    except Exception as e:
-        return str(e)
-
-
-def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str:
-    """Edits python code to solve a vision based task.
+    def load(self, file_path: Union[str, Path]) -> None:
+        with open(file_path, "rb") as f:
+            self.artifacts = pkl.load(f)  # NOTE(review): unpickling can execute arbitrary code; only load trusted files
 
-    Parameters:
-        code_file (str): The file path to the code.
-        chat_history (List[str]): The chat history to used to generate the code.
-
-    Returns:
-        str: The edited code.
-
-    Examples
-    --------
-        >>> edit_vision_code(
-        >>>     "code.py",
-        >>>     ["Can you detect the dogs in this image?", "Can you use a higher threshold?"],
-        >>>     ["dog.jpg"],
-        >>> )
-        from vision_agent.tools import load_image, owl_v2
-        def detect_dogs(image_path: str):
-            image = load_image(image_path)
-            dogs = owl_v2("dog", image, threshold=0.8)
-            return dogs
-    """
+    def save(self) -> None:
+        with open(self.save_path, "wb") as f:
+            pkl.dump(self.artifacts, f)
+
+    def __iter__(self):
+        return iter(self.artifacts)
+
+    def __getitem__(self, name: str) -> str:
+        return self.artifacts[name]
 
-    agent = va.agent.VisionAgentCoder()
-    with open(code_file, "r") as f:
-        code = f.read()
-
-    # Append latest code to second to last message from assistant
-    fixed_chat_history: List[Message] = []
-    for i, chat in enumerate(chat_history):
-        if i == 0:
-            fixed_chat_history.append({"role": "user", "content": chat, "media": media})
-        elif i > 0 and i < len(chat_history) - 1:
-            fixed_chat_history.append({"role": "user", "content": chat})
-        elif i == len(chat_history) - 1:
-            fixed_chat_history.append({"role": "assistant", "content": code})
-            fixed_chat_history.append({"role": "user", "content": chat})
-
-    try:
-        response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
-        code = response["code"]
-        with open(code_file, "w") as f:
-            f.write(code)
-        code_lines = code.splitlines(keepends=True)
-        total_lines = len(code_lines)
-        return view_lines(code_lines, 0, total_lines, code_file, total_lines)
-    except Exception as e:
-        return str(e)
+    def __setitem__(self, name: str, value: str) -> None:
+        self.artifacts[name] = value
+
+    def __contains__(self, name: str) -> bool:
+        return name in self.artifacts
 
 
 def format_lines(lines: List[str], start_idx: int) -> str:
@@ -136,34 +77,38 @@ def format_lines(lines: List[str], start_idx: int) -> str:
 
 
 def view_lines(
-    lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int
+    lines: List[str], line_num: int, window_size: int, name: str, total_lines: int
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)
     return (
-        f"[File: {file_path} ({total_lines} lines total)]\n"
+        f"[Artifact: {name} ({total_lines} lines total)]\n"
         + format_lines(lines[start:end], start)
-        + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]")
+        + (
+            "[End of artifact]"
+            if end == len(lines)
+            else f"[{len(lines) - end} more lines]"
+        )
     )
 
 
-def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str:
-    """Opens the file at at the given path in the editor. If `line_num` is provided,
-    the window will be moved to include that line. It only shows the first 100 lines by
-    default! Max `window_size` supported is 2000. use `scroll up/down` to view the file
-    if you want to see more.
+def open_artifact(
+    artifacts: Artifacts, name: str, line_num: int = 0, window_size: int = 100
+) -> str:
+    """Opens the provided artifact. If `line_num` is provided, the window will be moved
+    to include that line. It only shows the first 100 lines by default! Max
+    `window_size` supported is 2000.
 
     Parameters:
-        file_path (str): The file path to open, preferred absolute path.
+        artifacts (Artifacts): The artifacts object to open the artifact from.
+        name (str): The name of the artifact to open.
         line_num (int): The line number to move the window to.
         window_size (int): The number of lines to show above and below the line.
     """
+    if name not in artifacts:
+        return f"[Artifact {name} does not exist]"
 
-    file_path_p = Path(file_path)
-    if not file_path_p.exists():
-        return f"[File {file_path} does not exist]"
-
-    total_lines = sum(1 for _ in open(file_path_p))
+    total_lines = len(artifacts[name].splitlines())
     window_size = min(window_size, 2000)
     window_size = window_size // 2
     if line_num - window_size < 0:
@@ -171,158 +116,45 @@ def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str:
     elif line_num >= total_lines:
         line_num = total_lines - 1 - window_size
 
-    global CURRENT_LINE, CURRENT_FILE
-    CURRENT_LINE = line_num
-    CURRENT_FILE = file_path
-
-    with open(file_path, "r") as f:
-        lines = f.readlines()
-
-    return view_lines(lines, line_num, window_size, file_path, total_lines)
-
-
-def create_file(file_path: str) -> str:
-    """Creates and opens a new file with the given name.
-
-    Parameters:
-        file_path (str): The file path to create, preferred absolute path.
-    """
-
-    file_path_p = Path(file_path)
-    if file_path_p.exists():
-        return f"[File {file_path} already exists]"
-    file_path_p.touch()
-    global CURRENT_FILE
-    CURRENT_FILE = file_path
-    return f"[File created {file_path}]"
-
-
-def scroll_up() -> str:
-    """Moves the window up by 100 lines."""
-    if CURRENT_FILE is None:
-        return "[No file is open]"
-
-    return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE)
-
-
-def scroll_down() -> str:
-    """Moves the window down by 100 lines."""
-    if CURRENT_FILE is None:
-        return "[No file is open]"
-
-    return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE)
-
-
-def search_dir(search_term: str, dir_path: str) -> str:
-    """Searches for search_term in all files in a directory.
-
-    Parameters:
-        search_term (str): The search term to look for.
-        dir_path (str): The directory path to search in, preferred absolute path.
-    """
-
-    dir_path_p = Path(dir_path)
-    if not dir_path_p.exists():
-        return f"[Directory {dir_path} does not exist]"
-
-    matches = []
-    for file in dir_path_p.glob("**/*"):
-        if filter_file(file):
-            with open(file, "r") as f:
-                lines = f.readlines()
-                for i, line in enumerate(lines):
-                    if search_term in line:
-                        matches.append(f"{file}:{i}|{line.strip()}\n")
-    if not matches:
-        return f"[No matches found for {search_term} in {dir_path}]"
-    if len(matches) > 100:
-        return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]"
+    lines = [line + "\n" for line in artifacts[name].splitlines()]  # format_lines() adds no newline of its own
 
-    return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n"
-    for match in matches:
-        return_str += match
+    return view_lines(lines, line_num, window_size, name, total_lines)
 
-    return_str += f"[End of matches for {search_term} in {dir_path}]"
-    return return_str
 
-
-def search_file(search_term: str, file_path: str) -> str:
-    """Searches the file for the given search term.
+def create_artifact(artifacts: Artifacts, name: str) -> str:
+    """Creates a new artifact with the given name.
 
     Parameters:
-        search_term (str): The search term to look for.
-        file_path (str): The file path to search in, preferred absolute path.
+        artifacts (Artifacts): The artifacts object to add the new artifact to.
+        name (str): The name of the new artifact.
     """
+    if name in artifacts:
+        return f"[Artifact {name} already exists]"
+    artifacts[name] = ""
+    return f"[Artifact {name} created]"
 
-    file_path_p = Path(file_path)
-    if not file_path_p.exists():
-        return f"[File {file_path} does not exist]"
-
-    with open(file_path_p, "r") as f:
-        lines = f.readlines()
-
-    search_results = []
-    for i, line in enumerate(lines):
-        if search_term in line:
-            search_results.append(f"{i}|{line.strip()}\n")
 
-    if not search_results:
-        return f"[No matches found for {search_term} in {file_path}]"
-
-    return_str = (
-        f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n"
-    )
-    for result in search_results:
-        return_str += result
-
-    return_str += f"[End of matches for {search_term} in {file_path}]"
-    return return_str
-
-
-def find_file(file_name: str, dir_path: str = "./") -> str:
-    """Finds all files with the given name in the specified directory.
-
-    Parameters:
-        file_name (str): The file name to look for.
-        dir_path (str): The directory path to search in, preferred absolute path.
-    """
-
-    dir_path_p = Path(dir_path)
-    if not dir_path_p.exists():
-        return f"[Directory {dir_path} does not exist]"
-
-    files = list(dir_path_p.glob(f"**/*{file_name}*"))
-    files = [f for f in files if filter_file(f)]
-    if not files:
-        return f"[No files found in {dir_path} with name {file_name}]"
-
-    return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n"
-    for match in files:
-        return_str += str(match) + "\n"
-
-    return_str += f"[End of matches for {file_name} in {dir_path}]"
-    return return_str
-
-
-def edit_file(file_path: str, start: int, end: int, content: str) -> str:
-    """Edits the file at the given path with the provided content. The content will be
-    inserted between the `start` and `end` line numbers. If the `start` and `end` are
-    the same, the content will be inserted at the `start` line number. If the `end` is
-    greater than the total number of lines in the file, the content will be inserted at
-    the end of the file. If the `start` or `end` are negative, the function will return
-    an error message.
+def edit_artifact(
+    artifacts: Artifacts, name: str, start: int, end: int, content: str
+) -> str:
+    """Edits the given artifact with the provided content. The content will be inserted
+    between the `start` and `end` line numbers. If the `start` and `end` are the same,
+    the content will be inserted at the `start` line number. If the `end` is greater
+    than the total number of lines in the file, the content will be inserted at the end
+    of the file. If the `start` or `end` are negative, the function will return an
+    error message.
 
     Parameters:
-        file_path (str): The file path to edit, preferred absolute path.
+        artifacts (Artifacts): The artifacts object to edit the artifact from.
+        name (str): The name of the artifact to edit.
         start (int): The line number to start the edit.
         end (int): The line number to end the edit.
         content (str): The content to insert.
     """
-    file_path_p = Path(file_path)
-    if not file_path_p.exists():
-        return f"[File {file_path} does not exist]"
+    if name not in artifacts:
+        return f"[Artifact {name} does not exist]"
 
-    total_lines = sum(1 for _ in open(file_path_p))
+    total_lines = len(artifacts[name].splitlines())
     if start < 0 or end < 0 or start > end or end > total_lines:
         return "[Invalid line range]"
     if start == end:
@@ -332,50 +164,391 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str:
     new_content_lines = [
         line if line.endswith("\n") else line + "\n" for line in new_content_lines
     ]
-    with open(file_path_p, "r") as f:
-        lines = f.readlines()
-        edited_lines = lines[:start] + new_content_lines + lines[end:]
+    lines = [line + "\n" for line in artifacts[name].splitlines()]  # keep line ends so join/writelines do not fuse lines
+    edited_lines = lines[:start] + new_content_lines + lines[end:]
 
     cur_line = start + len(content.split("\n")) // 2
-    tmp_file = file_path_p.with_suffix(".tmp")
-    with open(tmp_file, "w") as f:
-        f.writelines(edited_lines)
-
-    process = subprocess.Popen(
-        [
-            "flake8",
-            "--isolated",
-            "--select=F821,F822,F831,E111,E112,E113,E999,E902",
-            tmp_file,
-        ],
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        text=True,
-    )
-    stdout, _ = process.communicate()
-    tmp_file.unlink()
-    if stdout != "":
-        stdout = stdout.replace(tmp_file.name, file_path)
-        error_msg = "[Edit failed with the following status]\n" + stdout
-        original_view = view_lines(
-            lines,
-            start + ((end - start) // 2),
-            DEFAULT_WINDOW_SIZE,
-            file_path,
-            total_lines,
+    with tempfile.NamedTemporaryFile("w", delete=True) as f:
+        f.writelines(edited_lines)
+        f.flush()  # flake8 reads the file by name, so the buffer must be on disk first
+
+        process = subprocess.Popen(
+            [
+                "flake8",
+                "--isolated",
+                "--select=F821,F822,F831,E111,E112,E113,E999,E902",
+                f.name,
+            ],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
         )
-        total_lines_edit = sum(1 for _ in edited_lines)
-        edited_view = view_lines(
-            edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit
-        )
-
-        error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
-        return error_msg
-
-    with open(file_path_p, "w") as f:
-        f.writelines(edited_lines)
+        stdout, _ = process.communicate()
+
+        if stdout != "":
+            stdout = stdout.replace(f.name, name)
+            error_msg = "[Edit failed with the following status]\n" + stdout
+            original_view = view_lines(
+                lines,
+                start + ((end - start) // 2),
+                DEFAULT_WINDOW_SIZE,
+                name,
+                total_lines,
+            )
+            total_lines_edit = sum(1 for _ in edited_lines)
+            edited_view = view_lines(
+                edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit
+            )
 
-    return open_file(file_path, cur_line)
+            error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+            return error_msg
+
+    artifacts[name] = "".join(edited_lines)
+
+    return open_artifact(artifacts, name, cur_line)
+
+
+# def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str:
+#     """Generates python code to solve vision based tasks.
+
+#     Parameters:
+#         save_file (str): The file path to save the code.
+#         chat (str): The chat message from the user.
+#         media (List[str]): The media files to use.
+
+#     Returns:
+#         str: The generated code.
+
+#     Examples
+#     --------
+#         >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"])
+#         from vision_agent.tools import load_image, owl_v2
+#         def detect_dogs(image_path: str):
+#             image = load_image(image_path)
+#             dogs = owl_v2("dog", image)
+#             return dogs
+#     """
+
+#     if ZMQ_PORT is not None:
+#         agent = va.agent.VisionAgentCoder(
+#             report_progress_callback=lambda inp: report_progress_callback(
+#                 int(ZMQ_PORT), inp
+#             )
+#         )
+#     else:
+#         agent = va.agent.VisionAgentCoder()
+#     try:
+#         fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
+#         response = agent.chat_with_workflow(fixed_chat)
+#         code = response["code"]
+#         with open(save_file, "w") as f:
+#             f.write(code)
+#         code_lines = code.splitlines(keepends=True)
+#         total_lines = len(code_lines)
+#         return view_lines(code_lines, 0, total_lines, save_file, total_lines)
+#     except Exception as e:
+#         return str(e)
+
+
+# def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str:
+#     """Edits python code to solve a vision based task.
+
+#     Parameters:
+#         code_file (str): The file path to the code.
+#         chat_history (List[str]): The chat history to used to generate the code.
+
+#     Returns:
+#         str: The edited code.
+
+#     Examples
+#     --------
+#         >>> edit_vision_code(
+#         >>>     "code.py",
+#         >>>     ["Can you detect the dogs in this image?", "Can you use a higher threshold?"],
+#         >>>     ["dog.jpg"],
+#         >>> )
+#         from vision_agent.tools import load_image, owl_v2
+#         def detect_dogs(image_path: str):
+#             image = load_image(image_path)
+#             dogs = owl_v2("dog", image, threshold=0.8)
+#             return dogs
+#     """
+
+#     agent = va.agent.VisionAgentCoder()
+#     with open(code_file, "r") as f:
+#         code = f.read()
+
+#     # Append latest code to second to last message from assistant
+#     fixed_chat_history: List[Message] = []
+#     for i, chat in enumerate(chat_history):
+#         if i == 0:
+#             fixed_chat_history.append({"role": "user", "content": chat, "media": media})
+#         elif i > 0 and i < len(chat_history) - 1:
+#             fixed_chat_history.append({"role": "user", "content": chat})
+#         elif i == len(chat_history) - 1:
+#             fixed_chat_history.append({"role": "assistant", "content": code})
+#             fixed_chat_history.append({"role": "user", "content": chat})
+
+#     try:
+#         response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+#         code = response["code"]
+#         with open(code_file, "w") as f:
+#             f.write(code)
+#         code_lines = code.splitlines(keepends=True)
+#         total_lines = len(code_lines)
+#         return view_lines(code_lines, 0, total_lines, code_file, total_lines)
+#     except Exception as e:
+#         return str(e)
+
+
+# def format_lines(lines: List[str], start_idx: int) -> str:
+#     output = ""
+#     for i, line in enumerate(lines):
+#         output += f"{i + start_idx}|{line}"
+#     return output
+
+
+# def view_lines(
+#     lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int
+# ) -> str:
+#     start = max(0, line_num - window_size)
+#     end = min(len(lines), line_num + window_size)
+#     return (
+#         f"[File: {file_path} ({total_lines} lines total)]\n"
+#         + format_lines(lines[start:end], start)
+#         + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]")
+#     )
+
+
+# def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str:
+#     """Opens the file at at the given path in the editor. If `line_num` is provided,
+#     the window will be moved to include that line. It only shows the first 100 lines by
+#     default! Max `window_size` supported is 2000. use `scroll up/down` to view the file
+#     if you want to see more.
+
+#     Parameters:
+#         file_path (str): The file path to open, preferred absolute path.
+#         line_num (int): The line number to move the window to.
+#         window_size (int): The number of lines to show above and below the line.
+#     """
+
+#     file_path_p = Path(file_path)
+#     if not file_path_p.exists():
+#         return f"[File {file_path} does not exist]"
+
+#     total_lines = sum(1 for _ in open(file_path_p))
+#     window_size = min(window_size, 2000)
+#     window_size = window_size // 2
+#     if line_num - window_size < 0:
+#         line_num = window_size
+#     elif line_num >= total_lines:
+#         line_num = total_lines - 1 - window_size
+
+#     global CURRENT_LINE, CURRENT_FILE
+#     CURRENT_LINE = line_num
+#     CURRENT_FILE = file_path
+
+#     with open(file_path, "r") as f:
+#         lines = f.readlines()
+
+#     return view_lines(lines, line_num, window_size, file_path, total_lines)
+
+
+# def create_file(file_path: str) -> str:
+#     """Creates and opens a new file with the given name.
+
+#     Parameters:
+#         file_path (str): The file path to create, preferred absolute path.
+#     """
+
+#     file_path_p = Path(file_path)
+#     if file_path_p.exists():
+#         return f"[File {file_path} already exists]"
+#     file_path_p.touch()
+#     global CURRENT_FILE
+#     CURRENT_FILE = file_path
+#     return f"[File created {file_path}]"
+
+
+# def scroll_up() -> str:
+#     """Moves the window up by 100 lines."""
+#     if CURRENT_FILE is None:
+#         return "[No file is open]"
+
+#     return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE)
+
+
+# def scroll_down() -> str:
+#     """Moves the window down by 100 lines."""
+#     if CURRENT_FILE is None:
+#         return "[No file is open]"
+
+#     return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE)
+
+
+# def search_dir(search_term: str, dir_path: str) -> str:
+#     """Searches for search_term in all files in a directory.
+
+#     Parameters:
+#         search_term (str): The search term to look for.
+#         dir_path (str): The directory path to search in, preferred absolute path.
+#     """
+
+#     dir_path_p = Path(dir_path)
+#     if not dir_path_p.exists():
+#         return f"[Directory {dir_path} does not exist]"
+
+#     matches = []
+#     for file in dir_path_p.glob("**/*"):
+#         if filter_file(file):
+#             with open(file, "r") as f:
+#                 lines = f.readlines()
+#                 for i, line in enumerate(lines):
+#                     if search_term in line:
+#                         matches.append(f"{file}:{i}|{line.strip()}\n")
+#     if not matches:
+#         return f"[No matches found for {search_term} in {dir_path}]"
+#     if len(matches) > 100:
+#         return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]"
+
+#     return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n"
+#     for match in matches:
+#         return_str += match
+
+#     return_str += f"[End of matches for {search_term} in {dir_path}]"
+#     return return_str
+
+
+# def search_file(search_term: str, file_path: str) -> str:
+#     """Searches the file for the given search term.
+
+#     Parameters:
+#         search_term (str): The search term to look for.
+#         file_path (str): The file path to search in, preferred absolute path.
+#     """
+
+#     file_path_p = Path(file_path)
+#     if not file_path_p.exists():
+#         return f"[File {file_path} does not exist]"
+
+#     with open(file_path_p, "r") as f:
+#         lines = f.readlines()
+
+#     search_results = []
+#     for i, line in enumerate(lines):
+#         if search_term in line:
+#             search_results.append(f"{i}|{line.strip()}\n")
+
+#     if not search_results:
+#         return f"[No matches found for {search_term} in {file_path}]"
+
+#     return_str = (
+#         f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n"
+#     )
+#     for result in search_results:
+#         return_str += result
+
+#     return_str += f"[End of matches for {search_term} in {file_path}]"
+#     return return_str
+
+
+# def find_file(file_name: str, dir_path: str = "./") -> str:
+#     """Finds all files with the given name in the specified directory.
+
+#     Parameters:
+#         file_name (str): The file name to look for.
+#         dir_path (str): The directory path to search in, preferred absolute path.
+#     """
+
+#     dir_path_p = Path(dir_path)
+#     if not dir_path_p.exists():
+#         return f"[Directory {dir_path} does not exist]"
+
+#     files = list(dir_path_p.glob(f"**/*{file_name}*"))
+#     files = [f for f in files if filter_file(f)]
+#     if not files:
+#         return f"[No files found in {dir_path} with name {file_name}]"
+
+#     return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n"
+#     for match in files:
+#         return_str += str(match) + "\n"
+
+#     return_str += f"[End of matches for {file_name} in {dir_path}]"
+#     return return_str
+
+
+# def edit_file(file_path: str, start: int, end: int, content: str) -> str:
+#     """Edits the file at the given path with the provided content. The content will be
+#     inserted between the `start` and `end` line numbers. If the `start` and `end` are
+#     the same, the content will be inserted at the `start` line number. If the `end` is
+#     greater than the total number of lines in the file, the content will be inserted at
+#     the end of the file. If the `start` or `end` are negative, the function will return
+#     an error message.
+
+#     Parameters:
+#         file_path (str): The file path to edit, preferred absolute path.
+#         start (int): The line number to start the edit.
+#         end (int): The line number to end the edit.
+#         content (str): The content to insert.
+#     """
+#     file_path_p = Path(file_path)
+#     if not file_path_p.exists():
+#         return f"[File {file_path} does not exist]"
+
+#     total_lines = sum(1 for _ in open(file_path_p))
+#     if start < 0 or end < 0 or start > end or end > total_lines:
+#         return "[Invalid line range]"
+#     if start == end:
+#         end += 1
+
+#     new_content_lines = content.splitlines(keepends=True)
+#     new_content_lines = [
+#         line if line.endswith("\n") else line + "\n" for line in new_content_lines
+#     ]
+#     with open(file_path_p, "r") as f:
+#         lines = f.readlines()
+#         edited_lines = lines[:start] + new_content_lines + lines[end:]
+
+#     cur_line = start + len(content.split("\n")) // 2
+#     tmp_file = file_path_p.with_suffix(".tmp")
+#     with open(tmp_file, "w") as f:
+#         f.writelines(edited_lines)
+
+#     process = subprocess.Popen(
+#         [
+#             "flake8",
+#             "--isolated",
+#             "--select=F821,F822,F831,E111,E112,E113,E999,E902",
+#             tmp_file,
+#         ],
+#         stdout=subprocess.PIPE,
+#         stderr=subprocess.PIPE,
+#         text=True,
+#     )
+#     stdout, _ = process.communicate()
+#     tmp_file.unlink()
+#     if stdout != "":
+#         stdout = stdout.replace(tmp_file.name, file_path)
+#         error_msg = "[Edit failed with the following status]\n" + stdout
+#         original_view = view_lines(
+#             lines,
+#             start + ((end - start) // 2),
+#             DEFAULT_WINDOW_SIZE,
+#             file_path,
+#             total_lines,
+#         )
+#         total_lines_edit = sum(1 for _ in edited_lines)
+#         edited_view = view_lines(
+#             edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit
+#         )
+
+#         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
+#         return error_msg
+
+#     with open(file_path_p, "w") as f:
+#         f.writelines(edited_lines)
+
+#     return open_file(file_path, cur_line)
 
 
 def get_tool_descriptions() -> str:
@@ -388,15 +561,19 @@ def get_tool_descriptions() -> str:
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
-        generate_vision_code,
-        edit_vision_code,
-        open_file,
-        create_file,
-        scroll_up,
-        scroll_down,
-        edit_file,
-        search_dir,
-        search_file,
-        find_file,
+        open_artifact,
+        create_artifact,
+        edit_artifact,
+        # generate_vision_code,
+        # edit_vision_code,
+        # open_file,
+        # create_file,
+        # scroll_up,
+        # scroll_down,
+        # edit_file,
+        # search_dir,
+        # search_file,
+        # find_file,
+        # florencev2_fine_tuning,
     ]
 )

From 8dede494c0ea25de9dbc9e490f986e563c869aca Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 15 Aug 2024 10:53:02 -0700
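Subject: [PATCH 03/37] update local executor

Add a module-level WORKSPACE path (read from the WORKSPACE environment
variable) and upload_file/download_file methods next to exec_cell.
---
Review notes (commentary after the '---' marker, not part of the commit):
* upload_file opens the source in text mode but the destination with
  "wb", so f.write(contents) is handed a str and raises TypeError.
* download_file copies into WORKSPACE, i.e. in the same direction as
  upload_file, and returns the WORKSPACE path.
Both points are corrected in PATCH 04/37.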
 vision_agent/utils/execute.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 033276d3..1ae5e446 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -40,6 +40,7 @@
 load_dotenv()
 _LOGGER = logging.getLogger(__name__)
 _SESSION_TIMEOUT = 600  # 10 minutes
+WORKSPACE = Path(os.getenv("WORKSPACE", ""))
 
 
 class MimeType(str, Enum):
@@ -607,6 +608,22 @@ def exec_cell(self, code: str) -> Execution:
             traceback_raw = traceback.format_exc().splitlines()
             return Execution.from_exception(e, traceback_raw)
 
+    def upload_file(self, file_path: str) -> Path:
+        with open(file_path) as f:
+            contents = f.read()
+        with open(WORKSPACE / file_path, "wb") as f:
+            f.write(contents)
+
+        return Path(WORKSPACE / file_path)
+
+    def download_file(self, file_path: str) -> Path:
+        with open(file_path, "rb") as f:
+            contents = f.read()
+        with open(WORKSPACE / file_path, "wb") as f:
+            f.write(contents)
+        return Path(WORKSPACE / file_path)
+
+
 
 class CodeInterpreterFactory:
     """Factory class for creating code interpreters.

From 97556be30c9348fceae8285f6558dd3e506da26e Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 26 Aug 2024 09:55:25 -0700
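Subject: [PATCH 04/37] fix upload/download

upload_file now reads the source file in binary mode so the bytes match
the binary write. download_file now reads from WORKSPACE / file_path,
writes to file_path, and returns that local path.
---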
 vision_agent/utils/execute.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 1ae5e446..299f5d2c 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -609,7 +609,7 @@ def exec_cell(self, code: str) -> Execution:
             return Execution.from_exception(e, traceback_raw)
 
     def upload_file(self, file_path: str) -> Path:
-        with open(file_path) as f:
+        with open(file_path, "rb") as f:
             contents = f.read()
         with open(WORKSPACE / file_path, "wb") as f:
             f.write(contents)
@@ -617,11 +617,11 @@ def upload_file(self, file_path: str) -> Path:
         return Path(WORKSPACE / file_path)
 
     def download_file(self, file_path: str) -> Path:
-        with open(file_path, "rb") as f:
+        with open(WORKSPACE / file_path, "rb") as f:
             contents = f.read()
-        with open(WORKSPACE / file_path, "wb") as f:
+        with open(file_path, "wb") as f:
             f.write(contents)
-        return Path(WORKSPACE / file_path)
+        return Path(file_path)
 
 
 

From 82169c24284764a8fc9634341a184551ead98698 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 26 Aug 2024 12:46:53 -0700
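Subject: [PATCH 05/37] cleaned up code for artifacts

Rename DefaultImports to BoilerplateCode with separate pre_code and
post_code lists, drop the temp-file upload logic from run_code_action,
add an optional artifacts argument to __call__ and chat_with_code, and
upload/download the artifacts file around each code action.
---
Review notes (commentary after the '---' marker, not part of the commit):
* pre_code contains literal "{remote_path}" placeholders, but
  add_boilerplate joins the strings without formatting them, so the
  generated code references an undefined name. PATCH 08/37 adds the
  quoting and the .format(**format) call.
* __call__ accepts artifacts but this patch does not change the
  chat_with_code(input) call; forwarding is added in PATCH 08/37.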
 vision_agent/agent/vision_agent.py | 63 ++++++++++++++++--------------
 1 file changed, 34 insertions(+), 29 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index dad2d824..f497f467 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,8 +1,8 @@
 import copy
 import logging
 import os
-import tempfile
 import pickle as pkl
+import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union, cast
 
@@ -14,8 +14,8 @@
     VA_CODE,
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
-from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.tools import META_TOOL_DOCSTRING
+from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter
 
@@ -28,24 +28,30 @@
     os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}"
 
 
-class DefaultImports:
-    code = [
+class BoilerplateCode:
+    pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
         "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions",
-        f"artifacts = Artifacts({ARTIFACT})",
+        "artifacts = Artifacts({remote_path})",
+        "artifacts.load({remote_path})",
+    ]
+    post_code = [
+        "artifacts.save()",
     ]
 
     @staticmethod
-    def to_code_string() -> str:
-        return "\n".join(DefaultImports.code)
-
-    @staticmethod
-    def prepend_imports(code: str) -> str:
+    def add_boilerplate(code: str) -> str:
         """Run this method to prepend the default imports to the code.
         NOTE: be sure to run this method after the custom tools have been registered.
         """
-        return DefaultImports.to_code_string() + "\n\n" + code
+        return (
+            "\n".join(BoilerplateCode.pre_code)
+            + "\n\n"
+            + code
+            + "\n\n"
+            + "\n".join(BoilerplateCode.post_code)
+        )
 
 
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
@@ -71,22 +77,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
 
 
-def run_code_action(code: str, artifacts: Artifacts, code_interpreter: CodeInterpreter) -> str:
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        for name in artifacts:
-            temp_file_path = Path(tmpdirname) / name + ".py"
-            with open(temp_file_path, "w") as f:
-                f.write(artifacts[name])
-            code_interpreter.upload_file(temp_file_path)
-            temp_file_path.unlink()
-
-        temp_file_path = Path(tmpdirname) / ARTIFACT
-        with open(temp_file_path, "wb") as f:
-            pkl.dump(artifacts.artifacts, f)
-        code_interpreter.upload_file(temp_file_path)
-        temp_file_path.unlink()
-
-    result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code))
+def run_code_action(
+    code: str, code_interpreter: CodeInterpreter
+) -> str:
+    result = code_interpreter.exec_cell(BoilerplateCode.add_boilerplate(code))
 
     return_str = ""
     if result.success:
@@ -150,6 +144,7 @@ def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
+        artifacts: Optional[Artifacts] = None,
     ) -> str:
         """Chat with VisionAgent and get the conversation response.
 
@@ -172,13 +167,13 @@ def __call__(
     def chat_with_code(
         self,
         chat: List[Message],
+        artifacts: Optional[Artifacts] = None,
     ) -> List[Message]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
 
         Parameters:
-            chat (List[Message]): A conversation
-                in the format of:
+            chat (List[Message]): A conversation in the format of:
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
@@ -190,6 +185,10 @@ def chat_with_code(
         if not chat:
             raise ValueError("chat cannot be empty")
 
+        if not artifacts:
+            artifacts = Artifacts("artifacts.pkl")
+            artifacts.save()
+
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
         ) as code_interpreter:
@@ -222,6 +221,8 @@ def chat_with_code(
             finished = False
             iterations = 0
             while not finished and iterations < self.max_iterations:
+                artifacts_remote_path = code_interpreter.upload_file(artifacts.save_path)
+
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
                     _LOGGER.info(response)
@@ -235,6 +236,10 @@ def chat_with_code(
 
                 if code_action is not None:
                     obs = run_code_action(code_action, code_interpreter)
+                    artifacts_local_path = code_interpreter.download_file(artifacts_remote_path)
+                    artifacts.load(artifacts_local_path)
+                    artifacts.save()
+
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
                     int_chat.append({"role": "observation", "content": obs})

From d1f160201a4c541a70279a8d9fdf1abbbfb5bc8d Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 26 Aug 2024 12:47:03 -0700
Subject: [PATCH 06/37] starting artifact prompts

---
 vision_agent/agent/vision_agent_prompts.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 7b714378..cf1e9a33 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -32,6 +32,16 @@
 """
 
 
+EXAMPLES_CODE1_ARTIFACT = """
+USER: Can you write a simple application that adds two numbers?
+
+AGENT: {"thoughts": "The user has asked to add two numbers, I will generate the code to add two numbers.", "response": "<execute_python>create_artifact(artifact, 'add_two_numbers')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Artifact add_two_numbers created]
+"""
+
+
 EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 

From 2fc76a53ac256550c5983d8aaacc29ed938e5aa5 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 08:39:24 -0700
Subject: [PATCH 07/37] app to add files to artifacts

---
 examples/chat/app.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/chat/app.py b/examples/chat/app.py
index f1cd62e7..94f3b23a 100644
--- a/examples/chat/app.py
+++ b/examples/chat/app.py
@@ -26,6 +26,8 @@
     "response": "saved",
     "style": {"bottom": "calc(50% - 4.25rem", "right": "0.4rem"},
 }
+artifacts = va.tools.Artifacts("artifacts.pkl")
+artifacts.save()
 agent = va.agent.VisionAgent(verbosity=1)
 
 st.set_page_config(layout="wide")
@@ -44,7 +46,9 @@
 
 
 def update_messages(messages, lock):
-    new_chat = agent.chat_with_code(messages)
+    if Path("artifacts.pkl").exists():
+        artifacts.load("artifacts.pkl")
+    new_chat = agent.chat_with_code(messages, artifacts=artifacts)
     with lock:
         for new_message in new_chat:
             if new_message not in messages:
@@ -121,6 +125,7 @@ def main():
             if uploaded_file is not None:
                 with open(WORKSPACE / uploaded_file.name, "wb") as f:
                     f.write(uploaded_file.getbuffer())
+                artifacts.artifacts[WORKSPACE / uploaded_file.name] = ""
 
             for file in WORKSPACE.iterdir():
                 if "__pycache__" not in str(file) and not str(file).startswith("."):

From 11cef6f4e60a831c6ba73d7165c21b2c98fbefc4 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 08:40:11 -0700
Subject: [PATCH 08/37] add support for artifacts

---
 vision_agent/agent/vision_agent.py | 83 +++++++++++++++---------------
 1 file changed, 41 insertions(+), 42 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index f497f467..0d617c8a 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,17 +1,15 @@
 import copy
 import logging
 import os
-import pickle as pkl
-import tempfile
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union, cast
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
 from vision_agent.agent.vision_agent_prompts import (
-    EXAMPLES_CODE1,
-    EXAMPLES_CODE2,
-    VA_CODE,
+    EXAMPLES_CODE1_ARTIFACT,
+    EXAMPLES_CODE2_ARTIFACT,
+    VA_CODE_ARTIFACT,
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
@@ -21,7 +19,6 @@
 
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
-ARTIFACT = "artifacts.pkl"
 WORKSPACE = Path(os.getenv("WORKSPACE", ""))
 WORKSPACE.mkdir(parents=True, exist_ok=True)
 if str(WORKSPACE) != "":
@@ -32,25 +29,25 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions",
-        "artifacts = Artifacts({remote_path})",
-        "artifacts.load({remote_path})",
+        "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code",
+        "artifacts = Artifacts('{remote_path}')",
+        "artifacts.load('{remote_path}')",
     ]
     post_code = [
         "artifacts.save()",
     ]
 
     @staticmethod
-    def add_boilerplate(code: str) -> str:
+    def add_boilerplate(code: str, **format) -> str:
         """Run this method to prepend the default imports to the code.
         NOTE: be sure to run this method after the custom tools have been registered.
         """
         return (
-            "\n".join(BoilerplateCode.pre_code)
+            "\n".join([s.format(**format) for s in BoilerplateCode.pre_code])
             + "\n\n"
             + code
             + "\n\n"
-            + "\n".join(BoilerplateCode.post_code)
+            + "\n".join([s.format(**format) for s in BoilerplateCode.post_code])
         )
 
 
@@ -68,38 +65,21 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         else:
             raise ValueError(f"role {chat_i['role']} is not supported")
 
-    prompt = VA_CODE.format(
+    prompt = VA_CODE_ARTIFACT.format(
         documentation=META_TOOL_DOCSTRING,
-        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
-        dir=WORKSPACE,
+        examples=f"{EXAMPLES_CODE1_ARTIFACT}\n{EXAMPLES_CODE2_ARTIFACT}",
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
 
 
 def run_code_action(
-    code: str, code_interpreter: CodeInterpreter
+    code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
 ) -> str:
-    result = code_interpreter.exec_cell(BoilerplateCode.add_boilerplate(code))
-
-    return_str = ""
-    if result.success:
-        for res in result.results:
-            if res.text is not None:
-                return_str += res.text.replace("\\n", "\n")
-        if result.logs.stdout:
-            return_str += "----- stdout -----\n"
-            for log in result.logs.stdout:
-                return_str += log.replace("\\n", "\n")
-    else:
-        # for log in result.logs.stderr:
-        #     return_str += log.replace("\\n", "\n")
-        if result.error:
-            return_str += (
-                "\n" + result.error.value + "\n".join(result.error.traceback_raw)
-            )
-
-    return return_str
+    result = code_interpreter.exec_cell(
+        BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
+    )
+    return result.text()
 
 
 def parse_execution(response: str) -> Optional[str]:
@@ -112,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]:
 
 class VisionAgent(Agent):
     """Vision Agent is an agent that can chat with the user and call tools or other
-    agents to generate code for it. Vision Agent uses python code to execute actions for
-    the user. Vision Agent is inspired by by OpenDev
+    agents to generate code for it. Vision Agent uses python code to execute actions
+    for the user. Vision Agent is inspired by by OpenDev
     https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030
 
     Example
@@ -161,7 +141,7 @@ def __call__(
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results = self.chat_with_code(input)
+        results = self.chat_with_code(input, artifacts)
         return results  # type: ignore
 
     def chat_with_code(
@@ -200,6 +180,10 @@ def chat_with_code(
                     for media in chat_i["media"]:
                         media = code_interpreter.upload_file(media)
                         chat_i["content"] += f" Media name {media}"  # type: ignore
+                        # Save dummy value for now since we just need to know the path
+                        # name in the key 'media'. Later on we can add artifact support
+                        # for byte data.
+                        artifacts.artifacts[media] = ""
                         media_list.append(media)
 
             int_chat = cast(
@@ -220,8 +204,14 @@ def chat_with_code(
 
             finished = False
             iterations = 0
+            last_response = None
             while not finished and iterations < self.max_iterations:
-                artifacts_remote_path = code_interpreter.upload_file(artifacts.save_path)
+                artifacts_remote_path = code_interpreter.upload_file(
+                    artifacts.save_path
+                )
+                artifacts_loaded = artifacts.show()
+                int_chat.append({"role": "observation", "content": artifacts_loaded})
+                orig_chat.append({"role": "observation", "content": artifacts_loaded})
 
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
@@ -229,14 +219,22 @@ def chat_with_code(
                 int_chat.append({"role": "assistant", "content": str(response)})
                 orig_chat.append({"role": "assistant", "content": str(response)})
 
+                # sometimes it gets stuck in a loop, so we force it to exit
+                if last_response == response:
+                    response["let_user_respond"] = True
+
                 if response["let_user_respond"]:
                     break
 
                 code_action = parse_execution(response["response"])
 
                 if code_action is not None:
-                    obs = run_code_action(code_action, code_interpreter)
-                    artifacts_local_path = code_interpreter.download_file(artifacts_remote_path)
+                    obs = run_code_action(
+                        code_action, code_interpreter, artifacts_remote_path
+                    )
+                    artifacts_local_path = code_interpreter.download_file(
+                        artifacts_remote_path
+                    )
                     artifacts.load(artifacts_local_path)
                     artifacts.save()
 
@@ -246,6 +244,7 @@ def chat_with_code(
                     orig_chat.append({"role": "observation", "content": obs})
 
                 iterations += 1
+                last_response = response
         return orig_chat
 
     def log_progress(self, data: Dict[str, Any]) -> None:

From 0163daa5708e7aa5117679331f974fa7ef7cd35b Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 08:40:34 -0700
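Subject: [PATCH 09/37] add artifact meta tools

Add Artifacts.show, make Artifacts.load write each artifact next to the
save path, print the result of view_lines and create_artifact, let
edit_artifact create a missing artifact, and add artifact-based
generate_vision_code and edit_vision_code tools.
---
Review notes (commentary after the '---' marker, not part of the commit):
* Artifacts.load rewrites every entry with mode "w". PATCH 07/37 and
  PATCH 08/37 register uploaded media with an empty-string placeholder,
  so loading appears to truncate those media files; placeholder entries
  should probably be skipped.
* edit_vision_code takes a media argument that its docstring does not
  describe, and it constructs VisionAgentCoder before checking that the
  artifact exists.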
 vision_agent/tools/meta_tools.py | 134 ++++++++++++++++++++++++++++---
 1 file changed, 123 insertions(+), 11 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 29d56c55..4b245757 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -3,7 +3,7 @@
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
 from uuid import UUID
 
 import vision_agent as va
@@ -11,8 +11,8 @@
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.utils.image_utils import convert_to_b64
 from vision_agent.utils import CodeInterpreterFactory
+from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -43,7 +43,7 @@ def filter_file(file_name: Union[str, Path]) -> bool:
 
 class Artifacts:
     def __init__(self, save_path: Union[str, Path]) -> None:
-        self.save_path = save_path
+        self.save_path = Path(save_path)
         self.artifacts = {}
 
         self.code_sandbox_runtime = None
@@ -51,6 +51,16 @@ def __init__(self, save_path: Union[str, Path]) -> None:
     def load(self, file_path: Union[str, Path]) -> None:
         with open(file_path, "rb") as f:
             self.artifacts = pkl.load(f)
+        for k, v in self.artifacts.items():
+            with open(self.save_path.parent / k, "w") as f:
+                f.write(v)
+
+    def show(self) -> str:
+        out_str = "[Artifacts loaded]\n"
+        for k in self.artifacts.keys():
+            out_str += f"Artifact {k} loaded to {str(self.save_path.parent / k)}\n"
+        out_str += "[End of artifacts]\n"
+        return out_str
 
     def save(self) -> None:
         with open(self.save_path, "wb") as f:
@@ -81,7 +91,7 @@ def view_lines(
 ) -> str:
     start = max(0, line_num - window_size)
     end = min(len(lines), line_num + window_size)
-    return (
+    return_str = (
         f"[Artifact: {name} ({total_lines} lines total)]\n"
         + format_lines(lines[start:end], start)
         + (
@@ -90,6 +100,8 @@ def view_lines(
             else f"[{len(lines) - end} more lines]"
         )
     )
+    print(return_str)
+    return return_str
 
 
 def open_artifact(
@@ -116,7 +128,7 @@ def open_artifact(
     elif line_num >= total_lines:
         line_num = total_lines - 1 - window_size
 
-    lines = artifacts[name].splitlines()
+    lines = artifacts[name].splitlines(keepends=True)
 
     return view_lines(lines, line_num, window_size, name, total_lines)
 
@@ -129,9 +141,12 @@ def create_artifact(artifacts: Artifacts, name: str) -> str:
         name (str): The name of the new artifact.
     """
     if name in artifacts:
-        return f"[Artifact {name} already exists]"
-    artifacts[name] = ""
-    return f"[Artifact {name} created]"
+        return_str = f"[Artifact {name} already exists]"
+    else:
+        artifacts[name] = ""
+        return_str = f"[Artifact {name} created]"
+    print(return_str)
+    return return_str
 
 
 def edit_artifact(
@@ -151,8 +166,10 @@ def edit_artifact(
         end (int): The line number to end the edit.
         content (str): The content to insert.
     """
+    # just make the artifact if it doesn't exist instead of forcing agent to call
+    # create_artifact
     if name not in artifacts:
-        return f"[Artifact {name} does not exist]"
+        artifacts[name] = ""
 
     total_lines = len(artifacts[name].splitlines())
     if start < 0 or end < 0 or start > end or end > total_lines:
@@ -208,6 +225,101 @@ def edit_artifact(
     return open_artifact(artifacts, name, cur_line)
 
 
+def generate_vision_code(
+    artifacts: Artifacts, name: str, chat: str, media: List[str]
+) -> str:
+    """Generates python code to solve vision based tasks.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the code to.
+        name (str): The name of the artifact to save the code to.
+        chat (str): The chat message from the user.
+        media (List[str]): The media files to use.
+
+    Returns:
+        str: The generated code.
+
+    Examples
+    --------
+        >>> generate_vision_code(artifacts, "code.py", "Can you detect the dogs in this image?", ["image.jpg"])
+        from vision_agent.tools import load_image, owl_v2
+        def detect_dogs(image_path: str):
+            image = load_image(image_path)
+            dogs = owl_v2("dog", image)
+            return dogs
+    """
+
+    if ZMQ_PORT is not None:
+        agent = va.agent.VisionAgentCoder(
+            report_progress_callback=lambda inp: report_progress_callback(
+                int(ZMQ_PORT), inp
+            )
+        )
+    else:
+        agent = va.agent.VisionAgentCoder()
+
+    fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
+    response = agent.chat_with_workflow(fixed_chat)
+    code = response["code"]
+    artifacts[name] = code
+    code_lines = code.splitlines(keepends=True)
+    total_lines = len(code_lines)
+    return view_lines(code_lines, 0, total_lines, name, total_lines)
+
+
+def edit_vision_code(
+    artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+) -> str:
+    """Edits python code to solve a vision based task.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the code to.
+        name (str): The file path to the code.
+        chat_history (List[str]): The chat history to used to generate the code.
+
+    Returns:
+        str: The edited code.
+
+    Examples
+    --------
+        >>> edit_vision_code(
+        >>>     artifacts,
+        >>>     "code.py",
+        >>>     ["Can you detect the dogs in this image?", "Can you use a higher threshold?"],
+        >>>     ["dog.jpg"],
+        >>> )
+        from vision_agent.tools import load_image, owl_v2
+        def detect_dogs(image_path: str):
+            image = load_image(image_path)
+            dogs = owl_v2("dog", image, threshold=0.8)
+            return dogs
+    """
+
+    agent = va.agent.VisionAgentCoder()
+    if name not in artifacts:
+        return f"[Artifact {name} does not exist]"
+
+    code = artifacts[name]
+
+    # Append latest code to second to last message from assistant
+    fixed_chat_history: List[Message] = []
+    for i, chat in enumerate(chat_history):
+        if i == 0:
+            fixed_chat_history.append({"role": "user", "content": chat, "media": media})
+        elif i > 0 and i < len(chat_history) - 1:
+            fixed_chat_history.append({"role": "user", "content": chat})
+        elif i == len(chat_history) - 1:
+            fixed_chat_history.append({"role": "assistant", "content": code})
+            fixed_chat_history.append({"role": "user", "content": chat})
+
+    response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+    code = response["code"]
+    artifacts[name] = code
+    code_lines = code.splitlines(keepends=True)
+    total_lines = len(code_lines)
+    return view_lines(code_lines, 0, total_lines, name, total_lines)
+
+
 # def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str:
 #     """Generates python code to solve vision based tasks.
 
@@ -564,8 +676,8 @@ def get_tool_descriptions() -> str:
         open_artifact,
         create_artifact,
         edit_artifact,
-        # generate_vision_code,
-        # edit_vision_code,
+        generate_vision_code,
+        edit_vision_code,
         # open_file,
         # create_file,
         # scroll_up,

From 2596f433ae42e00ed2b4d55eb29353c1f2394f11 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 08:41:06 -0700
Subject: [PATCH 10/37] ran isort

---
 vision_agent/clients/landing_public_api.py | 4 ++--
 vision_agent/tools/tools_types.py          | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py
index eec218ad..2319bf89 100644
--- a/vision_agent/clients/landing_public_api.py
+++ b/vision_agent/clients/landing_public_api.py
@@ -5,9 +5,9 @@
 from requests.exceptions import HTTPError
 
 from vision_agent.clients.http import BaseHTTP
-from vision_agent.utils.type_defs import LandingaiAPIKey
+from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask
 from vision_agent.utils.exceptions import FineTuneModelNotFound
-from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus
+from vision_agent.utils.type_defs import LandingaiAPIKey
 
 
 class LandingPublicAPI(BaseHTTP):
diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py
index aeb45c95..7b640adb 100644
--- a/vision_agent/tools/tools_types.py
+++ b/vision_agent/tools/tools_types.py
@@ -1,8 +1,8 @@
-from uuid import UUID
 from enum import Enum
-from typing import List, Tuple, Optional
+from typing import List, Optional, Tuple
+from uuid import UUID
 
-from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo
+from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer
 
 
 class BboxInput(BaseModel):

From 51d49f5ee4689008d242602fa45039ccaafa6deb Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 09:38:18 -0700
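Subject: [PATCH 11/37] prompt to work with artifacts

Describe the shared `artifacts` object in the VA_CODE prompt and update
the example conversation to use artifact-based tool calls.
---
Review notes (commentary after the '---' marker, not part of the commit):
* This patch renames EXAMPLES_CODE1_ARTIFACT to
  EXAMPLES_CODE1_ARTIFACTS, while PATCH 08/37 imports the singular
  name; the two need to agree.
* The task description still says code is executed with
  <execution_python>, but the tag shown and used in the examples is
  <execute_python>.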
 vision_agent/agent/vision_agent_prompts.py | 55 +++++++++++++---------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index cf1e9a33..93acb871 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -1,7 +1,7 @@
 VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
 
-**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>.
+**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
 
 <execute_python>
 print("Hello World!")
@@ -32,29 +32,24 @@
 """
 
 
-EXAMPLES_CODE1_ARTIFACT = """
-USER: Can you write a simple application that adds two numbers?
-
-AGENT: {"thoughts": "The user has asked to add two numbers, I will generate the code to add two numbers.", "response": "<execute_python>create_artifact(artifact, 'add_two_numbers')</execute_python>", "let_user_respond": false}
+EXAMPLES_CODE1_ARTIFACTS = """
+USER: Can you detect the dogs in this image? Media name dog.jpg
 
 OBSERVATION:
-[Artifact add_two_numbers created]
-"""
-
-
-EXAMPLES_CODE1 = """
-USER: Can you detect the dogs in this image? Media name dog.jpg
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+[End of artifacts]
 
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image)
 4|    return dogs
-[End of file]
+[End of artifact]
 
 AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
 
@@ -67,18 +62,23 @@
 
 USER: The the image only has one dog, can you fix this?
 
-AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])</execute_python>", "let_user_respond": false}
+[Artifacts loaded]
+Artifact dog.jpg loaded to /path/to/images/dog.jpg
+Artifact dog_detector.py loaded to /path/to/code/dog_detector.py
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "<execute_python>edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/dog_detector.py]
+[Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
 3|    dogs = owl_v2("dog", image, threshold=0.24)
 4|    return dogs
-[End of file]
+[End of artifact]
 
-AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/example/workspace/dog.jpg'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "<execute_python>from dog_detector import detect_dogs\n    print(detect_dogs('/path/to/images/dog.jpg'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
@@ -88,17 +88,26 @@
 """
 
 
-EXAMPLES_CODE2 = """
+EXAMPLES_CODE2_ARTIFACTS = """
 USER: Can you create a function to count workers with helmets?
 
+OBSERVATION:
+[Artifacts loaded]
+[End of artifacts]
+
 AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
 
 USER: Yes you can use workers.png
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])</execute_python>", "let_user_respond": false}
+OBSERVATION:
+[Artifacts loaded]
+Artifact workers.png loaded to /path/to/images/workers.png
+[End of artifacts]
+
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you write code to count workers with helmets in this image?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
-[File /example/workspace/code.py]
+[Artifact code.py]
 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
 1|def count_workers_with_helmets(image_path: str):
 2|    image = load_image(image_path)
@@ -115,9 +124,9 @@
 13|        if person_has_helmet:
 14|            count += 1
 15|    return count
-[End of file]
+[End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/example/workspace/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png'))</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----

From e31de9f0784b2b4e358f010fd1ef269897d18f21 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 09:42:35 -0700
Subject: [PATCH 12/37] minor fixes for prompts

---
 vision_agent/agent/vision_agent_prompts.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 93acb871..7e98bd94 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -1,4 +1,4 @@
-VA_CODE = """
+VA_CODE_ARTIFACTS = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
 
 **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
@@ -15,7 +15,6 @@
 **Examples**:
 Here is an example of how you can interact with a user and Actions to complete a task:
 --- START EXAMPLES ---
-[Current directory: /example/workspace]
 {examples}
 --- END EXAMPLES ---
 
@@ -26,8 +25,6 @@
 **Conversation**:
 Here is the current conversation so far:
 --- START CONVERSATION ---
-[Current directory: {dir}]
-
 {conversation}
 """
 

From 9e83881ca81a6caa6fb1f19e7cda3cf9d3d892db Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:02:36 -0700
Subject: [PATCH 13/37] add docs, fix load and saving remote files

---
 vision_agent/agent/vision_agent.py | 72 +++++++++++++++++++++---------
 1 file changed, 50 insertions(+), 22 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 0d617c8a..17fe347d 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -7,9 +7,9 @@
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
 from vision_agent.agent.vision_agent_prompts import (
-    EXAMPLES_CODE1_ARTIFACT,
-    EXAMPLES_CODE2_ARTIFACT,
-    VA_CODE_ARTIFACT,
+    EXAMPLES_CODE1,
+    EXAMPLES_CODE2,
+    VA_CODE,
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
@@ -65,9 +65,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
         else:
             raise ValueError(f"role {chat_i['role']} is not supported")
 
-    prompt = VA_CODE_ARTIFACT.format(
+    prompt = VA_CODE.format(
         documentation=META_TOOL_DOCSTRING,
-        examples=f"{EXAMPLES_CODE1_ARTIFACT}\n{EXAMPLES_CODE2_ARTIFACT}",
+        examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}",
         conversation=conversation,
     )
     return extract_json(orch([{"role": "user", "content": prompt}], stream=False))  # type: ignore
@@ -109,8 +109,20 @@ def __init__(
         self,
         agent: Optional[LMM] = None,
         verbosity: int = 0,
+        local_artifacts_path: Optional[Union[str, Path]] = None,
         code_sandbox_runtime: Optional[str] = None,
     ) -> None:
+        """Initialize the VisionAgent.
+
+        Parameters:
+            agent (Optional[LMM]): The agent to use for conversation and orchestration
+                of other agents.
+            verbosity (int): The verbosity level of the agent.
+            local_artifacts_path (Optional[Union[str, Path]]): The path to the local
+                artifacts file.
+            code_sandbox_runtime (Optional[str]): The code sandbox runtime to use.
+        """
+
         self.agent = (
             OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent
         )
@@ -119,13 +131,18 @@ def __init__(
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
+        self.local_artifacts_path = (
+            Path(local_artifacts_path)
+            if local_artifacts_path is not None
+            else "artifacts.pkl"
+        )
 
     def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
         artifacts: Optional[Artifacts] = None,
-    ) -> str:
+    ) -> List[Message]:
         """Chat with VisionAgent and get the conversation response.
 
         Parameters:
@@ -133,6 +150,7 @@ def __call__(
                 [{"role": "user", "content": "describe your task here..."}, ...] or a
                 string of just the contents.
             media (Optional[Union[str, Path]]): The media file to be used in the task.
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.
 
         Returns:
             str: The conversation response.
@@ -157,6 +175,7 @@ def chat_with_code(
                 [{"role": "user", "content": "describe your task here..."}]
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
+            artifacts (Optional[Artifacts]): The artifacts to use in the task.
 
         Returns:
             List[Message]: The conversation response.
@@ -166,8 +185,7 @@ def chat_with_code(
             raise ValueError("chat cannot be empty")
 
         if not artifacts:
-            artifacts = Artifacts("artifacts.pkl")
-            artifacts.save()
+            artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
 
         with CodeInterpreterFactory.new_instance(
             code_sandbox_runtime=self.code_sandbox_runtime
@@ -183,7 +201,7 @@ def chat_with_code(
                         # Save dummy value for now since we just need to know the path
                         # name in the key 'media'. Later on we can add artifact support
                         # for byte data.
-                        artifacts.artifacts[media] = ""
+                        artifacts.artifacts[Path(media).name] = None
                         media_list.append(media)
 
             int_chat = cast(
@@ -205,14 +223,22 @@ def chat_with_code(
             finished = False
             iterations = 0
             last_response = None
-            while not finished and iterations < self.max_iterations:
-                artifacts_remote_path = code_interpreter.upload_file(
-                    artifacts.save_path
-                )
-                artifacts_loaded = artifacts.show()
-                int_chat.append({"role": "observation", "content": artifacts_loaded})
-                orig_chat.append({"role": "observation", "content": artifacts_loaded})
 
+            # Save the current state of artifacts, will include any images the user
+            # passed in.
+            artifacts.save(self.local_artifacts_path)
+
+            # Upload artifacts to remote location and show where they are going
+            # to be loaded to. The actual loading happens in BoilerplateCode as
+            # part of the pre_code.
+            remote_artifacts_path = code_interpreter.upload_file(
+                self.local_artifacts_path
+            )
+            artifacts_loaded = artifacts.show()
+            int_chat.append({"role": "observation", "content": artifacts_loaded})
+            orig_chat.append({"role": "observation", "content": artifacts_loaded})
+
+            while not finished and iterations < self.max_iterations:
                 response = run_conversation(self.agent, int_chat)
                 if self.verbosity >= 1:
                     _LOGGER.info(response)
@@ -230,13 +256,8 @@ def chat_with_code(
 
                 if code_action is not None:
                     obs = run_code_action(
-                        code_action, code_interpreter, artifacts_remote_path
+                        code_action, code_interpreter, str(remote_artifacts_path)
                     )
-                    artifacts_local_path = code_interpreter.download_file(
-                        artifacts_remote_path
-                    )
-                    artifacts.load(artifacts_local_path)
-                    artifacts.save()
 
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
@@ -245,6 +266,13 @@ def chat_with_code(
 
                 iterations += 1
                 last_response = response
+
+            # after running the agent, download the artifacts locally
+            code_interpreter.download_file(
+                str(remote_artifacts_path.name), str(self.local_artifacts_path)
+            )
+            artifacts.load(self.local_artifacts_path)
+            artifacts.save()
         return orig_chat
 
     def log_progress(self, data: Dict[str, Any]) -> None:

From 84757f7b41b052a802628a5b7f17459859b1e757 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:02:44 -0700
Subject: [PATCH 14/37] rename prompts

---
 vision_agent/agent/vision_agent_prompts.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 7e98bd94..c1cf541e 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -1,4 +1,4 @@
-VA_CODE_ARTIFACTS = """
+VA_CODE = """
 **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
 
 **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.
@@ -29,7 +29,7 @@
 """
 
 
-EXAMPLES_CODE1_ARTIFACTS = """
+EXAMPLES_CODE1 = """
 USER: Can you detect the dogs in this image? Media name dog.jpg
 
 OBSERVATION:
@@ -85,7 +85,7 @@
 """
 
 
-EXAMPLES_CODE2_ARTIFACTS = """
+EXAMPLES_CODE2 = """
 USER: Can you create a function to count workers with helmets?
 
 OBSERVATION:

From 65c8cdb38e9d88d70795c0737df94005996ebdca Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:03:20 -0700
Subject: [PATCH 15/37] add docs for artifacts, allow None artifacts (which
 don't load) to be added

---
 vision_agent/tools/meta_tools.py | 35 ++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 4b245757..a25645c4 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -42,28 +42,47 @@ def filter_file(file_name: Union[str, Path]) -> bool:
 
 
 class Artifacts:
-    def __init__(self, save_path: Union[str, Path]) -> None:
-        self.save_path = Path(save_path)
+    """Artifacts is a class that allows you to sync files between a local and remote
+    environment. In our case, the remote environment could be where the VisionAgent is
+    executing code and as the user adds new images, files or modifies files, those
+    need to be in sync with the remote environment the VisionAgent is running in.
+    """
+
+    def __init__(self, remote_save_path: Union[str, Path]) -> None:
+        self.remote_save_path = Path(remote_save_path)
         self.artifacts = {}
 
         self.code_sandbox_runtime = None
 
     def load(self, file_path: Union[str, Path]) -> None:
+        """Loads are artifacts into the remote environment. If an artifact value is None
+        it will skip loading it.
+
+        Parameters:
+            file_path (Union[str, Path]): The file path to load the artifacts from
+        """
         with open(file_path, "rb") as f:
             self.artifacts = pkl.load(f)
         for k, v in self.artifacts.items():
-            with open(self.save_path.parent / k, "w") as f:
-                f.write(v)
+            if v is not None:
+                with open(self.remote_save_path.parent / k, "w") as f:
+                    f.write(v)
 
     def show(self) -> str:
+        """Shows the artifacts that have been loaded and their remote save paths."""
         out_str = "[Artifacts loaded]\n"
         for k in self.artifacts.keys():
-            out_str += f"Artifact {k} loaded to {str(self.save_path.parent / k)}\n"
+            out_str += (
+                f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n"
+            )
         out_str += "[End of artifacts]\n"
         return out_str
 
-    def save(self) -> None:
-        with open(self.save_path, "wb") as f:
+    def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
+        save_path = (
+            Path(local_path) if local_path is not None else self.remote_save_path
+        )
+        with open(save_path, "wb") as f:
             pkl.dump(self.artifacts, f)
 
     def __iter__(self):
@@ -259,7 +278,7 @@ def detect_dogs(image_path: str):
         agent = va.agent.VisionAgentCoder()
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(fixed_chat)
+    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=False)
     code = response["code"]
     artifacts[name] = code
     code_lines = code.splitlines(keepends=True)

From b3c13b1ecca2624e8209ad793e6c3b559e8ffd4a Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:10:21 -0700
Subject: [PATCH 16/37] e2b and local upload/download work similarly now, can
 pass in target download path

---
 vision_agent/utils/execute.py | 68 +++++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 299f5d2c..ee671a15 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -407,17 +407,19 @@ def exec_isolation(self, code: str) -> Execution:
         self.restart_kernel()
         return self.exec_cell(code)
 
-    def upload_file(self, file: Union[str, Path]) -> str:
+    def upload_file(self, file: Union[str, Path]) -> Path:
         # Default behavior is a no-op (for local code interpreter)
-        return str(file)
+        return Path(file)
 
-    def download_file(self, file_path: str) -> Path:
+    def download_file(self, remote_file_path: str, local_file_path: str) -> Path:
         # Default behavior is a no-op (for local code interpreter)
-        return Path(file_path)
+        return Path(local_file_path)
 
 
 class E2BCodeInterpreter(CodeInterpreter):
-    def __init__(self, *args: Any, **kwargs: Any) -> None:
+    def __init__(
+        self, remote_path: Optional[Union[str, Path]] = None, *args: Any, **kwargs: Any
+    ) -> None:
         super().__init__(*args, **kwargs)
         assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set"
         try:
@@ -444,6 +446,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
         _LOGGER.info(
             f"E2BCodeInterpreter (sandbox id: {self.interpreter.sandbox_id}) initialized:\n{sys_versions}"
         )
+        self.remote_path = Path(
+            remote_path if remote_path is not None else "/home/user"
+        )
 
     def close(self, *args: Any, **kwargs: Any) -> None:
         try:
@@ -517,19 +522,18 @@ def exec_cell(self, code: str) -> Execution:
         before_sleep=tenacity.before_sleep_log(_LOGGER, logging.INFO),
         after=tenacity.after_log(_LOGGER, logging.INFO),
     )
-    def upload_file(self, file: Union[str, Path]) -> str:
+    def upload_file(self, file: Union[str, Path]) -> Path:
         file_name = Path(file).name
-        remote_path = f"/home/user/{file_name}"
         with open(file, "rb") as f:
-            self.interpreter.files.write(path=remote_path, data=f)
-            _LOGGER.info(f"File ({file}) is uploaded to: {remote_path}")
-            return remote_path
+            self.interpreter.files.write(path=str(self.remote_path / file_name), data=f)
+        _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}")
+        return self.remote_path
 
-    def download_file(self, file_path: str) -> Path:
-        with tempfile.NamedTemporaryFile(mode="w+b", delete=False) as file:
-            file.write(self.interpreter.files.read(path=file_path, format="bytes"))
-            _LOGGER.info(f"File ({file_path}) is downloaded to: {file.name}")
-            return Path(file.name)
+    def download_file(self, remote_file_path: str, local_file_path: str) -> Path:
+        with open(local_file_path, "w+b") as f:
+            f.write(self.interpreter.files.read(path=remote_file_path, format="bytes"))
+        _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}")
+        return Path(local_file_path)
 
     @staticmethod
     def _new_e2b_interpreter_impl(*args, **kwargs) -> E2BCodeInterpreterImpl:  # type: ignore
@@ -541,7 +545,11 @@ def _new_e2b_interpreter_impl(*args, **kwargs) -> E2BCodeInterpreterImpl:  # typ
 
 
 class LocalCodeInterpreter(CodeInterpreter):
-    def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None:
+    def __init__(
+        self,
+        timeout: int = _SESSION_TIMEOUT,
+        remote_path: Optional[Union[str, Path]] = None,
+    ) -> None:
         super().__init__(timeout=timeout)
         self.nb = nbformat.v4.new_notebook()
         self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
@@ -555,6 +563,7 @@ def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None:
         )
         sleep(1)
         self._new_kernel()
+        self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
 
     def _new_kernel(self) -> None:
         if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)():  # type: ignore
@@ -611,18 +620,19 @@ def exec_cell(self, code: str) -> Execution:
     def upload_file(self, file_path: str) -> Path:
         with open(file_path, "rb") as f:
             contents = f.read()
-        with open(WORKSPACE / file_path, "wb") as f:
+        with open(self.remote_path / Path(file_path).name, "wb") as f:
             f.write(contents)
+        _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}")
 
-        return Path(WORKSPACE / file_path)
+        return Path(self.remote_path / file_path)
 
-    def download_file(self, file_path: str) -> Path:
-        with open(WORKSPACE / file_path, "rb") as f:
+    def download_file(self, remote_file_path: str, local_file_path: str) -> Path:
+        with open(self.remote_path / remote_file_path, "rb") as f:
             contents = f.read()
-        with open(file_path, "wb") as f:
+        with open(local_file_path, "wb") as f:
             f.write(contents)
-        return Path(file_path)
-
+        _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}")
+        return Path(local_file_path)
 
 
 class CodeInterpreterFactory:
@@ -647,13 +657,19 @@ def get_default_instance() -> CodeInterpreter:
         return instance
 
     @staticmethod
-    def new_instance(code_sandbox_runtime: Optional[str] = None) -> CodeInterpreter:
+    def new_instance(
+        code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None
+    ) -> CodeInterpreter:
         if not code_sandbox_runtime:
             code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
         if code_sandbox_runtime == "e2b":
-            instance: CodeInterpreter = E2BCodeInterpreter(timeout=_SESSION_TIMEOUT)
+            instance: CodeInterpreter = E2BCodeInterpreter(
+                timeout=_SESSION_TIMEOUT, remote_path=remote_path
+            )
         elif code_sandbox_runtime == "local":
-            instance = LocalCodeInterpreter(timeout=_SESSION_TIMEOUT)
+            instance = LocalCodeInterpreter(
+                timeout=_SESSION_TIMEOUT, remote_path=remote_path
+            )
         else:
             raise ValueError(
                 f"Unsupported code sandbox runtime: {code_sandbox_runtime}. Supported runtimes: e2b, local"

From 6ebb75b84caa81253b75bff06bf7470bf5ffafc6 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:10:33 -0700
Subject: [PATCH 17/37] add Artifacts to exports

---
 vision_agent/tools/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 3372fcbb..cbd92358 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional
 
-from .meta_tools import META_TOOL_DOCSTRING
+from .meta_tools import META_TOOL_DOCSTRING, Artifacts
 from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
 from .tool_utils import get_tool_descriptions_by_names
 from .tools import (

From 907c44970192d981f281576cf11fe1bd67985b0d Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:10:45 -0700
Subject: [PATCH 18/37] local chat app to work with artifacts

---
 examples/chat/app.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/examples/chat/app.py b/examples/chat/app.py
index 94f3b23a..68dede8d 100644
--- a/examples/chat/app.py
+++ b/examples/chat/app.py
@@ -26,9 +26,14 @@
     "response": "saved",
     "style": {"bottom": "calc(50% - 4.25rem", "right": "0.4rem"},
 }
-artifacts = va.tools.Artifacts("artifacts.pkl")
-artifacts.save()
-agent = va.agent.VisionAgent(verbosity=1)
+# set artifacts remote_path to WORKSPACE
+artifacts = va.tools.Artifacts(WORKSPACE / "artifacts.pkl")
+if Path("artifacts.pkl").exists():
+    artifacts.load("artifacts.pkl")
+else:
+    artifacts.save("artifacts.pkl")
+
+agent = va.agent.VisionAgent(verbosity=1, local_artifacts_path="artifacts.pkl")
 
 st.set_page_config(layout="wide")
 
@@ -125,7 +130,9 @@ def main():
             if uploaded_file is not None:
                 with open(WORKSPACE / uploaded_file.name, "wb") as f:
                     f.write(uploaded_file.getbuffer())
-                artifacts.artifacts[WORKSPACE / uploaded_file.name] = ""
+
+                # make it None so it won't load and overwrite the image
+                artifacts.artifacts[uploaded_file.name] = None
 
             for file in WORKSPACE.iterdir():
                 if "__pycache__" not in str(file) and not str(file).startswith("."):

From bbae983d856f34acc6fb6abc400c1e9ef4aca45b Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:16:33 -0700
Subject: [PATCH 19/37] updated docs

---
 README.md                         | 18 +++++++++++-------
 docs/index.md                     | 21 +++++++++++++--------
 vision_agent/utils/image_utils.py |  2 +-
 3 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index f41bef31..88c59973 100644
--- a/README.md
+++ b/README.md
@@ -41,15 +41,15 @@ export OPENAI_API_KEY="your-api-key"
 ```
 
 ### Vision Agent
-There are two agents that you can use. Vision Agent is a conversational agent that has
+There are two agents that you can use. `VisionAgent` is a conversational agent that has
 access to tools that allow it to write an navigate python code and file systems. It can
-converse with the user in natural language. VisionAgentCoder is an agent that can write
-code for vision tasks, such as counting people in an image. However, it cannot converse
-and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
-code.
+converse with the user in natural language. `VisionAgentCoder` is an agent specifically
+for writing code for vision tasks, such as counting people in an image. However, it
+cannot chat with you and can only respond with code. `VisionAgent` can call
+`VisionAgentCoder` to write vision code.
 
 #### Basic Usage
-To run the streamlit app locally to chat with Vision Agent, you can run the following
+To run the streamlit app locally to chat with `VisionAgent`, you can run the following
 command:
 
 ```bash
@@ -146,7 +146,7 @@ the code and having it update. You just need to add the code as a response from
 assistant:
 
 ```python
-agent = va.agent.VisionAgent(verbosity=2)
+agent = va.agent.VisionAgentCoder(verbosity=2)
 conv = [
     {
         "role": "user",
@@ -212,6 +212,10 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
 
+Can't find the tool you need and want to add it to `VisionAgent`? Check out our
+[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
+we add the source code for all the tools used in `VisionAgent`.
+
 ## Additional Backends
 ### Ollama
 We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
diff --git a/docs/index.md b/docs/index.md
index 8569c5cc..0f5022f9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -38,15 +38,15 @@ export OPENAI_API_KEY="your-api-key"
 ```
 
 ### Vision Agent
-There are two agents that you can use. Vision Agent is a conversational agent that has
+There are two agents that you can use. `VisionAgent` is a conversational agent that has
 access to tools that allow it to write an navigate python code and file systems. It can
-converse with the user in natural language. VisionAgentCoder is an agent that can write
-code for vision tasks, such as counting people in an image. However, it cannot converse
-and can only respond with code. VisionAgent can call VisionAgentCoder to write vision
-code.
+converse with the user in natural language. `VisionAgentCoder` is an agent specifically
+for writing code for vision tasks, such as counting people in an image. However, it
+cannot chat with you and can only respond with code. `VisionAgent` can call
+`VisionAgentCoder` to write vision code.
 
 #### Basic Usage
-To run the streamlit app locally to chat with Vision Agent, you can run the following
+To run the streamlit app locally to chat with `VisionAgent`, you can run the following
 command:
 
 ```bash
@@ -143,7 +143,7 @@ the code and having it update. You just need to add the code as a response from
 assistant:
 
 ```python
-agent = va.agent.VisionAgent(verbosity=2)
+agent = va.agent.VisionAgentCoder(verbosity=2)
 conv = [
     {
         "role": "user",
@@ -209,6 +209,10 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
 
+Can't find the tool you need and want to add it to `VisionAgent`? Check out our
+[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where
+we add the source code for all the tools used in `VisionAgent`.
+
 ## Additional Backends
 ### Ollama
 We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
@@ -230,6 +234,7 @@ tools. You can use it just like you would use `VisionAgentCoder`:
 >>> agent = va.agent.OllamaVisionAgentCoder()
 >>> agent("Count the apples in the image", media="apples.jpg")
 ```
+> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
 
 ### Azure OpenAI
 We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started
@@ -241,7 +246,7 @@ follow the Azure Setup section below. You can use it just like you would use=
 >>> agent = va.agent.AzureVisionAgentCoder()
 >>> agent("Count the apples in the image", media="apples.jpg")
 ```
-> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
+
 
 ### Azure Setup
 If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py
index d2bc8a6d..54688f93 100644
--- a/vision_agent/utils/image_utils.py
+++ b/vision_agent/utils/image_utils.py
@@ -70,7 +70,7 @@ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray:
     r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background.
 
     Parameters:
-        mask: The mask in run-length encoded as an array.
+        rle: The run-length encoded mask.
     """
     size = rle["size"]
     counts = rle["counts"]

From 3e7cfd2552f3c56ee5745631eac81c47bf7c17e0 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 13:24:33 -0700
Subject: [PATCH 20/37] fix flake8

---
 vision_agent/tools/meta_tools.py | 358 +------------------------------
 vision_agent/utils/execute.py    |   1 -
 2 files changed, 1 insertion(+), 358 deletions(-)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index a25645c4..bc3e3058 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -3,16 +3,12 @@
 import subprocess
 import tempfile
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Union
-from uuid import UUID
+from typing import Any, Dict, List, Optional, Union
 
 import vision_agent as va
-from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
-from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.image_utils import convert_to_b64
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -339,349 +335,6 @@ def detect_dogs(image_path: str):
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 
-# def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str:
-#     """Generates python code to solve vision based tasks.
-
-#     Parameters:
-#         save_file (str): The file path to save the code.
-#         chat (str): The chat message from the user.
-#         media (List[str]): The media files to use.
-
-#     Returns:
-#         str: The generated code.
-
-#     Examples
-#     --------
-#         >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"])
-#         from vision_agent.tools import load_image, owl_v2
-#         def detect_dogs(image_path: str):
-#             image = load_image(image_path)
-#             dogs = owl_v2("dog", image)
-#             return dogs
-#     """
-
-#     if ZMQ_PORT is not None:
-#         agent = va.agent.VisionAgentCoder(
-#             report_progress_callback=lambda inp: report_progress_callback(
-#                 int(ZMQ_PORT), inp
-#             )
-#         )
-#     else:
-#         agent = va.agent.VisionAgentCoder()
-#     try:
-#         fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-#         response = agent.chat_with_workflow(fixed_chat)
-#         code = response["code"]
-#         with open(save_file, "w") as f:
-#             f.write(code)
-#         code_lines = code.splitlines(keepends=True)
-#         total_lines = len(code_lines)
-#         return view_lines(code_lines, 0, total_lines, save_file, total_lines)
-#     except Exception as e:
-#         return str(e)
-
-
-# def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str:
-#     """Edits python code to solve a vision based task.
-
-#     Parameters:
-#         code_file (str): The file path to the code.
-#         chat_history (List[str]): The chat history to used to generate the code.
-
-#     Returns:
-#         str: The edited code.
-
-#     Examples
-#     --------
-#         >>> edit_vision_code(
-#         >>>     "code.py",
-#         >>>     ["Can you detect the dogs in this image?", "Can you use a higher threshold?"],
-#         >>>     ["dog.jpg"],
-#         >>> )
-#         from vision_agent.tools import load_image, owl_v2
-#         def detect_dogs(image_path: str):
-#             image = load_image(image_path)
-#             dogs = owl_v2("dog", image, threshold=0.8)
-#             return dogs
-#     """
-
-#     agent = va.agent.VisionAgentCoder()
-#     with open(code_file, "r") as f:
-#         code = f.read()
-
-#     # Append latest code to second to last message from assistant
-#     fixed_chat_history: List[Message] = []
-#     for i, chat in enumerate(chat_history):
-#         if i == 0:
-#             fixed_chat_history.append({"role": "user", "content": chat, "media": media})
-#         elif i > 0 and i < len(chat_history) - 1:
-#             fixed_chat_history.append({"role": "user", "content": chat})
-#         elif i == len(chat_history) - 1:
-#             fixed_chat_history.append({"role": "assistant", "content": code})
-#             fixed_chat_history.append({"role": "user", "content": chat})
-
-#     try:
-#         response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
-#         code = response["code"]
-#         with open(code_file, "w") as f:
-#             f.write(code)
-#         code_lines = code.splitlines(keepends=True)
-#         total_lines = len(code_lines)
-#         return view_lines(code_lines, 0, total_lines, code_file, total_lines)
-#     except Exception as e:
-#         return str(e)
-
-
-# def format_lines(lines: List[str], start_idx: int) -> str:
-#     output = ""
-#     for i, line in enumerate(lines):
-#         output += f"{i + start_idx}|{line}"
-#     return output
-
-
-# def view_lines(
-#     lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int
-# ) -> str:
-#     start = max(0, line_num - window_size)
-#     end = min(len(lines), line_num + window_size)
-#     return (
-#         f"[File: {file_path} ({total_lines} lines total)]\n"
-#         + format_lines(lines[start:end], start)
-#         + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]")
-#     )
-
-
-# def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str:
-#     """Opens the file at at the given path in the editor. If `line_num` is provided,
-#     the window will be moved to include that line. It only shows the first 100 lines by
-#     default! Max `window_size` supported is 2000. use `scroll up/down` to view the file
-#     if you want to see more.
-
-#     Parameters:
-#         file_path (str): The file path to open, preferred absolute path.
-#         line_num (int): The line number to move the window to.
-#         window_size (int): The number of lines to show above and below the line.
-#     """
-
-#     file_path_p = Path(file_path)
-#     if not file_path_p.exists():
-#         return f"[File {file_path} does not exist]"
-
-#     total_lines = sum(1 for _ in open(file_path_p))
-#     window_size = min(window_size, 2000)
-#     window_size = window_size // 2
-#     if line_num - window_size < 0:
-#         line_num = window_size
-#     elif line_num >= total_lines:
-#         line_num = total_lines - 1 - window_size
-
-#     global CURRENT_LINE, CURRENT_FILE
-#     CURRENT_LINE = line_num
-#     CURRENT_FILE = file_path
-
-#     with open(file_path, "r") as f:
-#         lines = f.readlines()
-
-#     return view_lines(lines, line_num, window_size, file_path, total_lines)
-
-
-# def create_file(file_path: str) -> str:
-#     """Creates and opens a new file with the given name.
-
-#     Parameters:
-#         file_path (str): The file path to create, preferred absolute path.
-#     """
-
-#     file_path_p = Path(file_path)
-#     if file_path_p.exists():
-#         return f"[File {file_path} already exists]"
-#     file_path_p.touch()
-#     global CURRENT_FILE
-#     CURRENT_FILE = file_path
-#     return f"[File created {file_path}]"
-
-
-# def scroll_up() -> str:
-#     """Moves the window up by 100 lines."""
-#     if CURRENT_FILE is None:
-#         return "[No file is open]"
-
-#     return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE)
-
-
-# def scroll_down() -> str:
-#     """Moves the window down by 100 lines."""
-#     if CURRENT_FILE is None:
-#         return "[No file is open]"
-
-#     return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE)
-
-
-# def search_dir(search_term: str, dir_path: str) -> str:
-#     """Searches for search_term in all files in a directory.
-
-#     Parameters:
-#         search_term (str): The search term to look for.
-#         dir_path (str): The directory path to search in, preferred absolute path.
-#     """
-
-#     dir_path_p = Path(dir_path)
-#     if not dir_path_p.exists():
-#         return f"[Directory {dir_path} does not exist]"
-
-#     matches = []
-#     for file in dir_path_p.glob("**/*"):
-#         if filter_file(file):
-#             with open(file, "r") as f:
-#                 lines = f.readlines()
-#                 for i, line in enumerate(lines):
-#                     if search_term in line:
-#                         matches.append(f"{file}:{i}|{line.strip()}\n")
-#     if not matches:
-#         return f"[No matches found for {search_term} in {dir_path}]"
-#     if len(matches) > 100:
-#         return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]"
-
-#     return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n"
-#     for match in matches:
-#         return_str += match
-
-#     return_str += f"[End of matches for {search_term} in {dir_path}]"
-#     return return_str
-
-
-# def search_file(search_term: str, file_path: str) -> str:
-#     """Searches the file for the given search term.
-
-#     Parameters:
-#         search_term (str): The search term to look for.
-#         file_path (str): The file path to search in, preferred absolute path.
-#     """
-
-#     file_path_p = Path(file_path)
-#     if not file_path_p.exists():
-#         return f"[File {file_path} does not exist]"
-
-#     with open(file_path_p, "r") as f:
-#         lines = f.readlines()
-
-#     search_results = []
-#     for i, line in enumerate(lines):
-#         if search_term in line:
-#             search_results.append(f"{i}|{line.strip()}\n")
-
-#     if not search_results:
-#         return f"[No matches found for {search_term} in {file_path}]"
-
-#     return_str = (
-#         f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n"
-#     )
-#     for result in search_results:
-#         return_str += result
-
-#     return_str += f"[End of matches for {search_term} in {file_path}]"
-#     return return_str
-
-
-# def find_file(file_name: str, dir_path: str = "./") -> str:
-#     """Finds all files with the given name in the specified directory.
-
-#     Parameters:
-#         file_name (str): The file name to look for.
-#         dir_path (str): The directory path to search in, preferred absolute path.
-#     """
-
-#     dir_path_p = Path(dir_path)
-#     if not dir_path_p.exists():
-#         return f"[Directory {dir_path} does not exist]"
-
-#     files = list(dir_path_p.glob(f"**/*{file_name}*"))
-#     files = [f for f in files if filter_file(f)]
-#     if not files:
-#         return f"[No files found in {dir_path} with name {file_name}]"
-
-#     return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n"
-#     for match in files:
-#         return_str += str(match) + "\n"
-
-#     return_str += f"[End of matches for {file_name} in {dir_path}]"
-#     return return_str
-
-
-# def edit_file(file_path: str, start: int, end: int, content: str) -> str:
-#     """Edits the file at the given path with the provided content. The content will be
-#     inserted between the `start` and `end` line numbers. If the `start` and `end` are
-#     the same, the content will be inserted at the `start` line number. If the `end` is
-#     greater than the total number of lines in the file, the content will be inserted at
-#     the end of the file. If the `start` or `end` are negative, the function will return
-#     an error message.
-
-#     Parameters:
-#         file_path (str): The file path to edit, preferred absolute path.
-#         start (int): The line number to start the edit.
-#         end (int): The line number to end the edit.
-#         content (str): The content to insert.
-#     """
-#     file_path_p = Path(file_path)
-#     if not file_path_p.exists():
-#         return f"[File {file_path} does not exist]"
-
-#     total_lines = sum(1 for _ in open(file_path_p))
-#     if start < 0 or end < 0 or start > end or end > total_lines:
-#         return "[Invalid line range]"
-#     if start == end:
-#         end += 1
-
-#     new_content_lines = content.splitlines(keepends=True)
-#     new_content_lines = [
-#         line if line.endswith("\n") else line + "\n" for line in new_content_lines
-#     ]
-#     with open(file_path_p, "r") as f:
-#         lines = f.readlines()
-#         edited_lines = lines[:start] + new_content_lines + lines[end:]
-
-#     cur_line = start + len(content.split("\n")) // 2
-#     tmp_file = file_path_p.with_suffix(".tmp")
-#     with open(tmp_file, "w") as f:
-#         f.writelines(edited_lines)
-
-#     process = subprocess.Popen(
-#         [
-#             "flake8",
-#             "--isolated",
-#             "--select=F821,F822,F831,E111,E112,E113,E999,E902",
-#             tmp_file,
-#         ],
-#         stdout=subprocess.PIPE,
-#         stderr=subprocess.PIPE,
-#         text=True,
-#     )
-#     stdout, _ = process.communicate()
-#     tmp_file.unlink()
-#     if stdout != "":
-#         stdout = stdout.replace(tmp_file.name, file_path)
-#         error_msg = "[Edit failed with the following status]\n" + stdout
-#         original_view = view_lines(
-#             lines,
-#             start + ((end - start) // 2),
-#             DEFAULT_WINDOW_SIZE,
-#             file_path,
-#             total_lines,
-#         )
-#         total_lines_edit = sum(1 for _ in edited_lines)
-#         edited_view = view_lines(
-#             edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit
-#         )
-
-#         error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}"
-#         return error_msg
-
-#     with open(file_path_p, "w") as f:
-#         f.writelines(edited_lines)
-
-#     return open_file(file_path, cur_line)
-
-
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
@@ -697,14 +350,5 @@ def get_tool_descriptions() -> str:
         edit_artifact,
         generate_vision_code,
         edit_vision_code,
-        # open_file,
-        # create_file,
-        # scroll_up,
-        # scroll_down,
-        # edit_file,
-        # search_dir,
-        # search_file,
-        # find_file,
-        # florencev2_fine_tuning,
     ]
 )
diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index ee671a15..08924875 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -5,7 +5,6 @@
 import platform
 import re
 import sys
-import tempfile
 import traceback
 import warnings
 from enum import Enum

From afc87c06e460702370e489d5f0cfd0d6059ab13c Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 14:21:34 -0700
Subject: [PATCH 21/37] fix mypy errors

---
 vision_agent/agent/agent.py              |  2 +-
 vision_agent/agent/vision_agent.py       | 15 +++++++++------
 vision_agent/agent/vision_agent_coder.py |  2 +-
 vision_agent/tools/meta_tools.py         |  8 ++++----
 vision_agent/utils/execute.py            | 10 +++++-----
 5 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/vision_agent/agent/agent.py b/vision_agent/agent/agent.py
index 6b11f297..ca2cf181 100644
--- a/vision_agent/agent/agent.py
+++ b/vision_agent/agent/agent.py
@@ -11,7 +11,7 @@ def __call__(
         self,
         input: Union[str, List[Message]],
         media: Optional[Union[str, Path]] = None,
-    ) -> str:
+    ) -> Union[str, List[Message]]:
         pass
 
     @abstractmethod
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 17fe347d..6399016e 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -38,7 +38,7 @@ class BoilerplateCode:
     ]
 
     @staticmethod
-    def add_boilerplate(code: str, **format) -> str:
+    def add_boilerplate(code: str, **format: Any) -> str:
         """Run this method to prepend the default imports to the code.
         NOTE: be sure to run this method after the custom tools have been registered.
         """
@@ -131,10 +131,13 @@ def __init__(
         self.code_sandbox_runtime = code_sandbox_runtime
         if self.verbosity >= 1:
             _LOGGER.setLevel(logging.INFO)
-        self.local_artifacts_path = (
-            Path(local_artifacts_path)
-            if local_artifacts_path is not None
-            else "artifacts.pkl"
+        self.local_artifacts_path = cast(
+            str,
+            (
+                Path(local_artifacts_path)
+                if local_artifacts_path is not None
+                else "artifacts.pkl"
+            ),
         )
 
     def __call__(
@@ -160,7 +163,7 @@ def __call__(
             if media is not None:
                 input[0]["media"] = [media]
         results = self.chat_with_code(input, artifacts)
-        return results  # type: ignore
+        return results
 
     def chat_with_code(
         self,
diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index 7856bdb8..cc0711b6 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -725,7 +725,7 @@ def chat_with_workflow(
                             else code_interpreter.upload_file(media)
                         )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
-                        media_list.append(media)
+                        media_list.append(str(media))
 
             int_chat = cast(
                 List[Message],
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index bc3e3058..89c2dbdd 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -46,7 +46,7 @@ class Artifacts:
 
     def __init__(self, remote_save_path: Union[str, Path]) -> None:
         self.remote_save_path = Path(remote_save_path)
-        self.artifacts = {}
+        self.artifacts: Dict[str, Any] = {}
 
         self.code_sandbox_runtime = None
 
@@ -81,10 +81,10 @@ def save(self, local_path: Optional[Union[str, Path]] = None) -> None:
         with open(save_path, "wb") as f:
             pkl.dump(self.artifacts, f)
 
-    def __iter__(self):
+    def __iter__(self) -> Any:
         return iter(self.artifacts)
 
-    def __getitem__(self, name: str) -> str:
+    def __getitem__(self, name: str) -> Any:
         return self.artifacts[name]
 
     def __setitem__(self, name: str, value: str) -> None:
@@ -201,7 +201,7 @@ def edit_artifact(
 
     cur_line = start + len(content.split("\n")) // 2
     with tempfile.NamedTemporaryFile(delete=True) as f:
-        with open(f.name, "w") as f:
+        with open(f.name, "w") as f:  # type: ignore
             f.writelines(edited_lines)
 
         process = subprocess.Popen(
diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 08924875..15e4f9b9 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -410,7 +410,7 @@ def upload_file(self, file: Union[str, Path]) -> Path:
         # Default behavior is a no-op (for local code interpreter)
         return Path(file)
 
-    def download_file(self, remote_file_path: str, local_file_path: str) -> Path:
+    def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path:
         # Default behavior is a no-op (for local code interpreter)
         return Path(local_file_path)
 
@@ -528,9 +528,9 @@ def upload_file(self, file: Union[str, Path]) -> Path:
         _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}")
         return self.remote_path
 
-    def download_file(self, remote_file_path: str, local_file_path: str) -> Path:
+    def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path:
         with open(local_file_path, "w+b") as f:
-            f.write(self.interpreter.files.read(path=remote_file_path, format="bytes"))
+            f.write(self.interpreter.files.read(path=str(remote_file_path), format="bytes"))
         _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}")
         return Path(local_file_path)
 
@@ -616,7 +616,7 @@ def exec_cell(self, code: str) -> Execution:
             traceback_raw = traceback.format_exc().splitlines()
             return Execution.from_exception(e, traceback_raw)
 
-    def upload_file(self, file_path: str) -> Path:
+    def upload_file(self, file_path: Union[str, Path]) -> Path:
         with open(file_path, "rb") as f:
             contents = f.read()
         with open(self.remote_path / Path(file_path).name, "wb") as f:
@@ -625,7 +625,7 @@ def upload_file(self, file_path: str) -> Path:
 
         return Path(self.remote_path / file_path)
 
-    def download_file(self, remote_file_path: str, local_file_path: str) -> Path:
+    def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path:
         with open(self.remote_path / remote_file_path, "rb") as f:
             contents = f.read()
         with open(local_file_path, "wb") as f:

From 4aa9fec019a77ad1122a1a805c6f54606607eba2 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 15:15:19 -0700
Subject: [PATCH 22/37] fix format

---
 vision_agent/utils/execute.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 15e4f9b9..05b03612 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -410,7 +410,9 @@ def upload_file(self, file: Union[str, Path]) -> Path:
         # Default behavior is a no-op (for local code interpreter)
         return Path(file)
 
-    def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path:
+    def download_file(
+        self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
+    ) -> Path:
         # Default behavior is a no-op (for local code interpreter)
         return Path(local_file_path)
 
@@ -528,9 +530,13 @@ def upload_file(self, file: Union[str, Path]) -> Path:
         _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}")
         return self.remote_path
 
-    def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path:
+    def download_file(
+        self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
+    ) -> Path:
         with open(local_file_path, "w+b") as f:
-            f.write(self.interpreter.files.read(path=str(remote_file_path), format="bytes"))
+            f.write(
+                self.interpreter.files.read(path=str(remote_file_path), format="bytes")
+            )
         _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}")
         return Path(local_file_path)
 
@@ -625,7 +631,9 @@ def upload_file(self, file_path: Union[str, Path]) -> Path:
 
         return Path(self.remote_path / file_path)
 
-    def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path:
+    def download_file(
+        self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]
+    ) -> Path:
         with open(self.remote_path / remote_file_path, "rb") as f:
             contents = f.read()
         with open(local_file_path, "wb") as f:

From 53dea5728be340dba0f12f2b8539ffeb914dce4a Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Wed, 28 Aug 2024 20:28:50 -0700
Subject: [PATCH 23/37] add execution to conversation

---
 vision_agent/agent/vision_agent.py | 14 +++++++-------
 vision_agent/lmm/types.py          |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 6399016e..04cafd5e 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -15,7 +15,7 @@
 from vision_agent.tools import META_TOOL_DOCSTRING
 from vision_agent.tools.meta_tools import Artifacts
 from vision_agent.utils import CodeInterpreterFactory
-from vision_agent.utils.execute import CodeInterpreter
+from vision_agent.utils.execute import CodeInterpreter, Execution
 
 logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
@@ -75,11 +75,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
 
 def run_code_action(
     code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str
-) -> str:
-    result = code_interpreter.exec_cell(
+) -> Execution:
+    return code_interpreter.exec_isolation(
         BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path)
     )
-    return result.text()
 
 
 def parse_execution(response: str) -> Optional[str]:
@@ -258,14 +257,15 @@ def chat_with_code(
                 code_action = parse_execution(response["response"])
 
                 if code_action is not None:
-                    obs = run_code_action(
+                    result = run_code_action(
                         code_action, code_interpreter, str(remote_artifacts_path)
                     )
+                    obs = result.text()
 
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
-                    int_chat.append({"role": "observation", "content": obs})
-                    orig_chat.append({"role": "observation", "content": obs})
+                    int_chat.append({"role": "observation", "content": obs, "execution": result})
+                    orig_chat.append({"role": "observation", "content": obs, "execution": result})
 
                 iterations += 1
                 last_response = response
diff --git a/vision_agent/lmm/types.py b/vision_agent/lmm/types.py
index ded6a42b..ba2b3189 100644
--- a/vision_agent/lmm/types.py
+++ b/vision_agent/lmm/types.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from typing import Dict, Sequence, Union
+from vision_agent.utils.execute import Execution
 
 TextOrImage = Union[str, Sequence[Union[str, Path]]]
-Message = Dict[str, TextOrImage]
+Message = Dict[str, Union[TextOrImage, Execution]]

From e508809658458e309d68c441f3842a48df050b6e Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 08:32:32 -0700
Subject: [PATCH 24/37] fixed type errors

---
 vision_agent/agent/vision_agent.py       | 10 +++++++---
 vision_agent/agent/vision_agent_coder.py |  2 +-
 vision_agent/lmm/lmm.py                  |  6 ++++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 04cafd5e..58bea5ca 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -198,7 +198,7 @@ def chat_with_code(
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(media)
+                        media = code_interpreter.upload_file(cast(str, media))
                         chat_i["content"] += f" Media name {media}"  # type: ignore
                         # Save dummy value for now since we just need to know the path
                         # name in the key 'media'. Later on we can add artifact support
@@ -264,8 +264,12 @@ def chat_with_code(
 
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
-                    int_chat.append({"role": "observation", "content": obs, "execution": result})
-                    orig_chat.append({"role": "observation", "content": obs, "execution": result})
+                    int_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )
+                    orig_chat.append(
+                        {"role": "observation", "content": obs, "execution": result}
+                    )
 
                 iterations += 1
                 last_response = response
diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index cc0711b6..c8488902 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -722,7 +722,7 @@ def chat_with_workflow(
                             media
                             if type(media) is str
                             and media.startswith(("http", "https"))
-                            else code_interpreter.upload_file(media)
+                            else code_interpreter.upload_file(cast(str, media))
                         )
                         chat_i["content"] += f" Media name {media}"  # type: ignore
                         media_list.append(str(media))
diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py
index 15df5ac9..76481f3f 100644
--- a/vision_agent/lmm/lmm.py
+++ b/vision_agent/lmm/lmm.py
@@ -138,7 +138,7 @@ def chat(
             fixed_c["content"] = [{"type": "text", "text": c["content"]}]  # type: ignore
             if "media" in c:
                 for media in c["media"]:
-                    encoded_media = encode_media(media)
+                    encoded_media = encode_media(cast(str, media))
 
                     fixed_c["content"].append(  # type: ignore
                         {
@@ -389,7 +389,9 @@ def chat(
         fixed_chat = []
         for message in chat:
             if "media" in message:
-                message["images"] = [encode_media(m) for m in message["media"]]
+                message["images"] = [
+                    encode_media(cast(str, m)) for m in message["media"]
+                ]
                 del message["media"]
             fixed_chat.append(message)
         url = f"{self.url}/chat"

From d83857e8f430e0458eff13b3eb824a303b502bbc Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 09:47:26 -0700
Subject: [PATCH 25/37] fixed bug with upload file

---
 vision_agent/utils/execute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 05b03612..b2da6f11 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -528,7 +528,7 @@ def upload_file(self, file: Union[str, Path]) -> Path:
         with open(file, "rb") as f:
             self.interpreter.files.write(path=str(self.remote_path / file_name), data=f)
         _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}")
-        return self.remote_path
+        return self.remote_path / file_name
 
     def download_file(
         self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]

From 51503b90febc9d6d85b277b2cd2d757d6f30210f Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 10:15:32 -0700
Subject: [PATCH 26/37] added ability to write media files to artifacts

---
 vision_agent/agent/vision_agent.py | 17 +++++-----
 vision_agent/tools/meta_tools.py   | 53 +++++++++++++++++++-----------
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 58bea5ca..df73cef4 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -29,7 +29,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
@@ -198,13 +198,14 @@ def chat_with_code(
             for chat_i in int_chat:
                 if "media" in chat_i:
                     for media in chat_i["media"]:
-                        media = code_interpreter.upload_file(cast(str, media))
-                        chat_i["content"] += f" Media name {media}"  # type: ignore
-                        # Save dummy value for now since we just need to know the path
-                        # name in the key 'media'. Later on we can add artifact support
-                        # for byte data.
-                        artifacts.artifacts[Path(media).name] = None
-                        media_list.append(media)
+                        media = cast(str, media)
+                        artifacts.artifacts[Path(media).name] = open(media, "rb").read()
+
+                        media_remote_path = (
+                            Path(code_interpreter.remote_path) / Path(media).name
+                        )
+                        chat_i["content"] += f" Media name {media_remote_path}"  # type: ignore
+                        media_list.append(media_remote_path)
 
             int_chat = cast(
                 List[Message],
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 89c2dbdd..364bdf0e 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -61,7 +61,8 @@ def load(self, file_path: Union[str, Path]) -> None:
             self.artifacts = pkl.load(f)
         for k, v in self.artifacts.items():
             if v is not None:
-                with open(self.remote_save_path.parent / k, "w") as f:
+                mode = "w" if isinstance(v, str) else "wb"
+                with open(self.remote_save_path.parent / k, mode) as f:
                     f.write(v)
 
     def show(self) -> str:
@@ -87,7 +88,7 @@ def __iter__(self) -> Any:
     def __getitem__(self, name: str) -> Any:
         return self.artifacts[name]
 
-    def __setitem__(self, name: str, value: str) -> None:
+    def __setitem__(self, name: str, value: Any) -> None:
         self.artifacts[name] = value
 
     def __contains__(self, name: str) -> bool:
@@ -119,11 +120,11 @@ def view_lines(
     return return_str
 
 
-def open_artifact(
+def open_code_artifact(
     artifacts: Artifacts, name: str, line_num: int = 0, window_size: int = 100
 ) -> str:
-    """Opens the provided artifact. If `line_num` is provided, the window will be moved
-    to include that line. It only shows the first 100 lines by default! Max
+    """Opens the provided code artifact. If `line_num` is provided, the window will be
+    moved to include that line. It only shows the first 100 lines by default! Max
     `window_size` supported is 2000.
 
     Parameters:
@@ -148,8 +149,8 @@ def open_artifact(
     return view_lines(lines, line_num, window_size, name, total_lines)
 
 
-def create_artifact(artifacts: Artifacts, name: str) -> str:
-    """Creates a new artifiact with the given name.
+def create_code_artifact(artifacts: Artifacts, name: str) -> str:
+    """Creates a new code artifact with the given name.
 
     Parameters:
         artifacts (Artifacts): The artifacts object to add the new artifact to.
@@ -164,15 +165,15 @@ def create_artifact(artifacts: Artifacts, name: str) -> str:
     return return_str
 
 
-def edit_artifact(
+def edit_code_artifact(
     artifacts: Artifacts, name: str, start: int, end: int, content: str
 ) -> str:
-    """Edits the given artifact with the provided content. The content will be inserted
-    between the `start` and `end` line numbers. If the `start` and `end` are the same,
-    the content will be inserted at the `start` line number. If the `end` is greater
-    than the total number of lines in the file, the content will be inserted at the end
-    of the file. If the `start` or `end` are negative, the function will return an
-    error message.
+    """Edits the given code artifact with the provided content. The content will be
+    inserted between the `start` and `end` line numbers. If the `start` and `end` are
+    the same, the content will be inserted at the `start` line number. If the `end` is
+    greater than the total number of lines in the file, the content will be inserted at
+    the end of the file. If the `start` or `end` are negative, the function will return
+    an error message.
 
     Parameters:
         artifacts (Artifacts): The artifacts object to edit the artifact from.
@@ -237,7 +238,7 @@ def edit_artifact(
 
     artifacts[name] = "".join(edited_lines)
 
-    return open_artifact(artifacts, name, cur_line)
+    return open_code_artifact(artifacts, name, cur_line)
 
 
 def generate_vision_code(
@@ -274,7 +275,7 @@ def detect_dogs(image_path: str):
         agent = va.agent.VisionAgentCoder()
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=False)
+    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True)
     code = response["code"]
     artifacts[name] = code
     code_lines = code.splitlines(keepends=True)
@@ -335,6 +336,19 @@ def detect_dogs(image_path: str):
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 
+def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
+    """Writes a media file to the artifacts object.
+
+    Parameters:
+        artifacts (Artifacts): The artifacts object to save the media to.
+        local_path (str): The local path to the media file.
+    """
+    with open(local_path, "rb") as f:
+        media = f.read()
+    artifacts[Path(local_path).name] = media
+    return f"[Media {Path(local_path).name} saved]"
+
+
 def get_tool_descriptions() -> str:
     """Returns a description of all the tools that `generate_vision_code` has access to.
     Helpful for answering questions about what types of vision tasks you can do with
@@ -345,10 +359,11 @@ def get_tool_descriptions() -> str:
 META_TOOL_DOCSTRING = get_tool_documentation(
     [
         get_tool_descriptions,
-        open_artifact,
-        create_artifact,
-        edit_artifact,
+        open_code_artifact,
+        create_code_artifact,
+        edit_code_artifact,
         generate_vision_code,
         edit_vision_code,
+        write_media_artifact,
     ]
 )

From 0ed6bb7bfc22f9167b2eb991ddda0725b3cf7a8f Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 10:15:58 -0700
Subject: [PATCH 27/37] return outside of context

---
 vision_agent/tools/tools.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 62a1908a..0695b547 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1,8 +1,9 @@
-import os
 import io
 import json
 import logging
+import os
 import tempfile
+import urllib.request
 from importlib import resources
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
@@ -15,7 +16,6 @@
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
-import urllib.request
 
 from vision_agent.clients.landing_public_api import LandingPublicAPI
 from vision_agent.tools.tool_utils import (
@@ -1332,7 +1332,7 @@ def save_video(
         video.write_videofile(f.name, codec="libx264")
         f.close()
         _save_video_to_result(f.name)
-        return f.name
+    return f.name
 
 
 def _save_video_to_result(video_uri: str) -> None:

From 04bd7686d752baea5293cd6c72d0d1d2cb260244 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 10:17:03 -0700
Subject: [PATCH 28/37] make remote path execute variable

---
 vision_agent/utils/execute.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index b2da6f11..37c8d260 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -384,8 +384,15 @@ def from_e2b_execution(exec: E2BExecution) -> "Execution":
 class CodeInterpreter(abc.ABC):
     """Code interpreter interface."""
 
-    def __init__(self, timeout: int, *args: Any, **kwargs: Any) -> None:
+    def __init__(
+        self,
+        timeout: int,
+        remote_path: Optional[Union[str, Path]] = None,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
         self.timeout = timeout
+        self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE)
 
     def __enter__(self) -> Self:
         return self

From 9782893dc7c5624eb297e6115eec12f828392905 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 10:19:24 -0700
Subject: [PATCH 29/37] add codec for video encoding

---
 vision_agent/utils/image_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py
index 54688f93..979f6c97 100644
--- a/vision_agent/utils/image_utils.py
+++ b/vision_agent/utils/image_utils.py
@@ -100,7 +100,7 @@ def frames_to_bytes(
     """
     with tempfile.NamedTemporaryFile(delete=True) as temp_file:
         clip = ImageSequenceClip(frames, fps=fps)
-        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps)
+        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, code="libx264")
         with open(temp_file.name + f".{file_ext}", "rb") as f:
             buffer_bytes = f.read()
     return buffer_bytes

From 75c12893d90d82e680230fa1d0f501e350dbafcf Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 10:19:52 -0700
Subject: [PATCH 30/37] fix prompts to include writing media artifacts

---
 vision_agent/agent/vision_agent_prompts.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index c1cf541e..85e34cd5 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -86,7 +86,7 @@
 
 
 EXAMPLES_CODE2 = """
-USER: Can you create a function to count workers with helmets?
+USER: Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?
 
 OBSERVATION:
 [Artifacts loaded]
@@ -101,16 +101,17 @@
 Artifact workers.png loaded to /path/to/images/workers.png
 [End of artifacts]
 
-AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you write code to count workers with helmets in this image?', media=['/paths/to/images/workers.png'])</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "<execute_python>generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?', media=['/path/to/images/workers.png'])</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance
-1|def count_workers_with_helmets(image_path: str):
+0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    workers = owl_v2("worker", image)
-4|    helmets = owl_v2("helmet", image)
-5|    count = 0
+3|    detections = owl_v2("worker, helmet", image)
+4|    workers = [d for d in detections if d['label'] == 'worker']
+5|    helmets = [d for d in detections if d['label'] == 'helmet']
+6|    count = 0
 6|    for worker in workers:
 7|        person_box = worker['bbox']
 8|        person_has_helmet = False
@@ -120,14 +121,16 @@
 12|                break
 13|        if person_has_helmet:
 14|            count += 1
+15|    overlay_bounding_boxes(image, detections)
+16|    save_image(output_path, image)
 15|    return count
 [End of artifact]
 
-AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png'))</execute_python>", "let_user_respond": false}
+AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "<execute_python>from code import count_workers_with_helmets\n    print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n    write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
 
 OBSERVATION:
 ----- stdout -----
 2
 
-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """

From 1d8dd7863e53f5238ad2515a6befbf05b1945c7d Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 10:20:04 -0700
Subject: [PATCH 31/37] isort

---
 vision_agent/lmm/types.py      | 1 +
 vision_agent/tools/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/vision_agent/lmm/types.py b/vision_agent/lmm/types.py
index ba2b3189..b9c99fe2 100644
--- a/vision_agent/lmm/types.py
+++ b/vision_agent/lmm/types.py
@@ -1,5 +1,6 @@
 from pathlib import Path
 from typing import Dict, Sequence, Union
+
 from vision_agent.utils.execute import Execution
 
 TextOrImage = Union[str, Sequence[Union[str, Path]]]
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index cbd92358..e82d7553 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -21,8 +21,8 @@
     dpt_hybrid_midas,
     extract_frames,
     florence2_image_caption,
-    florence2_phrase_grounding,
     florence2_ocr,
+    florence2_phrase_grounding,
     florence2_roberta_vqa,
     florence2_sam2_image,
     florence2_sam2_video,

From 7a510e3bc21cc9d9f74efc1405d1b3d47c3f7eac Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 13:18:26 -0700
Subject: [PATCH 32/37] fix typo

---
 vision_agent/utils/image_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py
index 979f6c97..c1cc8eb6 100644
--- a/vision_agent/utils/image_utils.py
+++ b/vision_agent/utils/image_utils.py
@@ -100,7 +100,7 @@ def frames_to_bytes(
     """
     with tempfile.NamedTemporaryFile(delete=True) as temp_file:
         clip = ImageSequenceClip(frames, fps=fps)
-        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, code="libx264")
+        clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, codec="libx264")
         with open(temp_file.name + f".{file_ext}", "rb") as f:
             buffer_bytes = f.read()
     return buffer_bytes

From ac9a5e09df599fb40249077fc8ad30f19c718415 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 14:59:15 -0700
Subject: [PATCH 33/37] added redisplay for nested notebook sessions

---
 vision_agent/tools/meta_tools.py | 34 ++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index 364bdf0e..b3c3ed8c 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -5,10 +5,13 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+from IPython.display import display
+
 import vision_agent as va
 from vision_agent.lmm.types import Message
 from vision_agent.tools.tool_utils import get_tool_documentation
 from vision_agent.tools.tools import TOOL_DESCRIPTIONS
+from vision_agent.utils.execute import Execution, MimeType
 
 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent
 
@@ -37,6 +40,35 @@ def filter_file(file_name: Union[str, Path]) -> bool:
     )
 
 
+def redisplay_results(execution: Execution) -> None:
+    """This function is used to add previous execution results to the current output.
+    This is handy if you are inside a notebook environment, call it notebook1, and you
+    have a nested notebook environment, call it notebook2, and you want the execution
+    results from notebook2 to be included in the execution results for notebook1.
+    """
+    for result in execution.results:
+        if result.text is not None:
+            display({MimeType.TEXT_PLAIN: result.text})
+        if result.html is not None:
+            display({MimeType.TEXT_HTML: result.html})
+        if result.markdown is not None:
+            display({MimeType.TEXT_MARKDOWN: result.markdown})
+        if result.svg is not None:
+            display({MimeType.IMAGE_SVG: result.svg})
+        if result.png is not None:
+            display({MimeType.IMAGE_PNG: result.png})
+        if result.jpeg is not None:
+            display({MimeType.IMAGE_JPEG: result.jpeg})
+        if result.mp4 is not None:
+            display({MimeType.VIDEO_MP4_B64: result.mp4})
+        if result.latex is not None:
+            display({MimeType.TEXT_LATEX: result.latex})
+        if result.json is not None:
+            display({MimeType.APPLICATION_JSON: result.json})
+        if result.extra is not None:
+            display(result.extra)
+
+
 class Artifacts:
     """Artifacts is a class that allows you to sync files between a local and remote
     environment. In our case, the remote environment could be where the VisionAgent is
@@ -276,6 +308,7 @@ def detect_dogs(image_path: str):
 
     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
     response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True)
+    redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
     code_lines = code.splitlines(keepends=True)
@@ -329,6 +362,7 @@ def detect_dogs(image_path: str):
             fixed_chat_history.append({"role": "user", "content": chat})
 
     response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+    redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
     code_lines = code.splitlines(keepends=True)

From 32b1ce92d6083cebe706e0dc5e393b656b959f3f Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 19:13:08 -0700
Subject: [PATCH 34/37] return artifacts

---
 vision_agent/agent/vision_agent.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index df73cef4..5544b188 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -1,8 +1,9 @@
 import copy
 import logging
 import os
+import tempfile
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
 from vision_agent.agent.agent_utils import extract_json
@@ -135,7 +136,7 @@ def __init__(
             (
                 Path(local_artifacts_path)
                 if local_artifacts_path is not None
-                else "artifacts.pkl"
+                else Path(tempfile.NamedTemporaryFile(delete=False).name)
             ),
         )
 
@@ -161,14 +162,14 @@ def __call__(
             input = [{"role": "user", "content": input}]
             if media is not None:
                 input[0]["media"] = [media]
-        results = self.chat_with_code(input, artifacts)
+        results, _ = self.chat_with_code(input, artifacts)
         return results
 
     def chat_with_code(
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
-    ) -> List[Message]:
+    ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
 
@@ -187,6 +188,7 @@ def chat_with_code(
             raise ValueError("chat cannot be empty")
 
         if not artifacts:
+            # this is setting remote artifacts path
             artifacts = Artifacts(WORKSPACE / "artifacts.pkl")
 
         with CodeInterpreterFactory.new_instance(
@@ -265,9 +267,8 @@ def chat_with_code(
 
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)
-                    int_chat.append(
-                        {"role": "observation", "content": obs, "execution": result}
-                    )
+                    # don't add execution results to internal chat
+                    int_chat.append({"role": "observation", "content": obs})
                     orig_chat.append(
                         {"role": "observation", "content": obs, "execution": result}
                     )
@@ -281,7 +282,7 @@ def chat_with_code(
             )
             artifacts.load(self.local_artifacts_path)
             artifacts.save()
-        return orig_chat
+        return orig_chat, artifacts
 
     def log_progress(self, data: Dict[str, Any]) -> None:
         pass

From 33cf8e71af32c59058fc15a8ac1901fa3fab0b69 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 19:18:53 -0700
Subject: [PATCH 35/37] add trace for last edited artifact

---
 vision_agent/tools/meta_tools.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index b3c3ed8c..833ad542 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -194,6 +194,8 @@ def create_code_artifact(artifacts: Artifacts, name: str) -> str:
         artifacts[name] = ""
         return_str = f"[Artifact {name} created]"
     print(return_str)
+
+    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
     return return_str
 
 
@@ -270,6 +272,7 @@ def edit_code_artifact(
 
     artifacts[name] = "".join(edited_lines)
 
+    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
     return open_code_artifact(artifacts, name, cur_line)
 
 
@@ -313,6 +316,8 @@ def detect_dogs(image_path: str):
     artifacts[name] = code
     code_lines = code.splitlines(keepends=True)
     total_lines = len(code_lines)
+
+    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 
@@ -367,6 +372,8 @@ def detect_dogs(image_path: str):
     artifacts[name] = code
     code_lines = code.splitlines(keepends=True)
     total_lines = len(code_lines)
+
+    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 

From 40c1cbdce4feec26de15a35c15b588ca3dc10539 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 20:19:22 -0700
Subject: [PATCH 36/37] handle artifact return

---
 examples/chat/app.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/chat/app.py b/examples/chat/app.py
index 68dede8d..9291f65a 100644
--- a/examples/chat/app.py
+++ b/examples/chat/app.py
@@ -53,7 +53,7 @@
 def update_messages(messages, lock):
     if Path("artifacts.pkl").exists():
         artifacts.load("artifacts.pkl")
-    new_chat = agent.chat_with_code(messages, artifacts=artifacts)
+    new_chat, _ = agent.chat_with_code(messages, artifacts=artifacts)
     with lock:
         for new_message in new_chat:
             if new_message not in messages:

From 58a1be455a579e0dbd6248762849aceffc854a08 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 29 Aug 2024 20:19:44 -0700
Subject: [PATCH 37/37] only add text to obs, no trace

---
 vision_agent/agent/vision_agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 5544b188..2bb04343 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -263,7 +263,7 @@ def chat_with_code(
                     result = run_code_action(
                         code_action, code_interpreter, str(remote_artifacts_path)
                     )
-                    obs = result.text()
+                    obs = str(result.logs)
 
                     if self.verbosity >= 1:
                         _LOGGER.info(obs)