From 35db97eaddb64888f231915b6958a59b4b432807 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 9 May 2024 16:26:26 -0700
Subject: [PATCH 01/10] fixed save and load

---
 vision_agent/utils/__init__.py |  2 +-
 vision_agent/utils/sim.py      | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/vision_agent/utils/__init__.py b/vision_agent/utils/__init__.py
index 11dc0d42..10a49df4 100644
--- a/vision_agent/utils/__init__.py
+++ b/vision_agent/utils/__init__.py
@@ -1,3 +1,3 @@
 from .execute import Execute
-from .sim import Sim
+from .sim import Sim, load_sim, merge_sim
 from .video import extract_frames_from_video
diff --git a/vision_agent/utils/sim.py b/vision_agent/utils/sim.py
index 3a244cd8..4b4298d7 100644
--- a/vision_agent/utils/sim.py
+++ b/vision_agent/utils/sim.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Union
 
+import numpy as np
 import pandas as pd
 from openai import Client
 from scipy.spatial.distance import cosine  # type: ignore
@@ -46,7 +47,14 @@ def __init__(
             )
 
     def save(self, sim_file: Union[str, Path]) -> None:
-        self.df.to_csv(sim_file, index=False)
+        sim_file = Path(sim_file)
+        sim_file.mkdir(parents=True, exist_ok=True)
+
+        df = self.df.copy()
+        embs = np.array(df.embs.tolist())
+        np.save(sim_file / "embs.npy", embs)
+        df = df.drop("embs", axis=1)
+        df.to_csv(sim_file / "df.csv", index=False)
 
     def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.
@@ -65,6 +73,13 @@ def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
 
 
+def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
+    return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))
+
+
 def load_sim(sim_file: Union[str, Path]) -> Sim:
-    df = pd.read_csv(sim_file)
+    sim_file = Path(sim_file)
+    df = pd.read_csv(sim_file / "df.csv")
+    embs = np.load(sim_file / "embs.npy")
+    df["embs"] = list(embs)
     return Sim(df)

From 8d3c60d94b6b6ea7aec16b2bc13fd3e03e187ea5 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Thu, 9 May 2024 16:26:42 -0700
Subject: [PATCH 02/10] added long term memory

---
 vision_agent/agent/vision_agent_v2.py        | 76 ++++++++++++++++----
 vision_agent/agent/vision_agent_v2_prompt.py | 33 ++++++---
 2 files changed, 84 insertions(+), 25 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index 0889bf4e..c2a875ce 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+import pandas as pd
 from rich.console import Console
 from rich.syntax import Syntax
 from tabulate import tabulate
@@ -20,6 +21,7 @@
     TEST,
     USER_REQ_CONTEXT,
     USER_REQ_SUBTASK_CONTEXT,
+    USER_REQ_SUBTASK_WM_CONTEXT,
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
@@ -48,11 +50,16 @@ def write_plan(
 
 
 def write_code(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str,
+    subtask: str,
+    working_memory: str,
+    tool_info: str,
+    code: str,
+    model: LLM,
 ) -> str:
     prompt = CODE.format(
-        context=USER_REQ_SUBTASK_CONTEXT.format(
-            user_requirement=user_req, subtask=subtask
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req, working_memory=working_memory, subtask=subtask
         ),
         tool_info=tool_info,
         code=code,
@@ -83,14 +90,24 @@ def write_test(
     return extract_code(code)
 
 
-def debug_code(sub_task: str, working_memory: List[str], model: LLM) -> Tuple[str, str]:
+def debug_code(
+    user_req: str,
+    subtask: str,
+    retrieved_ltm: str,
+    working_memory: str,
+    model: LLM,
+) -> Tuple[str, str]:
     # Make debug model output JSON
     if hasattr(model, "kwargs"):
         model.kwargs["response_format"] = {"type": "json_object"}
     prompt = DEBUG.format(
         debug_example=DEBUG_EXAMPLE,
-        context=USER_REQ_CONTEXT.format(user_requirement=sub_task),
-        previous_impl="\n".join(working_memory),
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req,
+            subtask=subtask,
+            working_memory=retrieved_ltm,
+        ),
+        previous_impl=working_memory,
     )
     messages = [
         {"role": "system", "content": DEBUG_SYS_MSG},
@@ -110,6 +127,7 @@ def write_and_exec_code(
     model: LLM,
     tool_info: str,
     exec: Execute,
+    retrieved_ltm: str,
     max_retry: int = 3,
     verbose: bool = False,
 ) -> Tuple[bool, str, str, Dict[str, List[str]]]:
@@ -117,8 +135,9 @@ def write_and_exec_code(
     counter = 0
     reflection = ""
 
-    # TODO: add working memory to code_writer_call and debug_code
-    code = code_writer_call(user_req, subtask, tool_info, orig_code, model)
+    code = code_writer_call(
+        user_req, subtask, retrieved_ltm, tool_info, orig_code, model
+    )
     success, result = exec.run_isolation(code)
     working_memory: Dict[str, List[str]] = {}
     while not success and counter < max_retry:
@@ -136,7 +155,9 @@ def write_and_exec_code(
                 PREV_CODE_CONTEXT.format(code=code, result=result)
             )
 
-        code, reflection = debug_code(subtask, working_memory[subtask], model)
+        code, reflection = debug_code(
+            user_req, subtask, retrieved_ltm, "\n".join(working_memory[subtask]), model
+        )
         success, result = exec.run_isolation(code)
         counter += 1
         if verbose:
@@ -148,7 +169,7 @@ def write_and_exec_code(
         if success:
             working_memory[subtask].append(
                 PREV_CODE_CONTEXT_WITH_REFLECTION.format(
-                    code=code, result=result, reflection=reflection
+                    reflection=reflection, code=code, result=result
                 )
             )
 
@@ -162,12 +183,14 @@ def run_plan(
     exec: Execute,
     code: str,
     tool_recommender: Sim,
+    long_term_memory: Optional[Sim] = None,
     verbose: bool = False,
 ) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
     active_plan = [e for e in plan if "success" not in e or not e["success"]]
-    working_memory: Dict[str, List[str]] = {}
     current_code = code
     current_test = ""
+    retrieved_ltm = ""
+    working_memory: Dict[str, List[str]] = {}
     for task in active_plan:
         _LOGGER.info(
             f"""
@@ -176,7 +199,13 @@ def run_plan(
         tool_info = "\n".join(
             [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
         )
-        success, code, result, task_memory = write_and_exec_code(
+
+        if long_term_memory is not None:
+            retrieved_ltm = "\n".join(
+                [e["doc"] for e in long_term_memory.top_k(task["instruction"], 1)]
+            )
+
+        success, code, result, working_memory_i = write_and_exec_code(
             user_req,
             task["instruction"],
             current_code,
@@ -184,6 +213,7 @@ def run_plan(
             coder,
             tool_info,
             exec,
+            retrieved_ltm,
             verbose,
         )
         if task["type"] == "code":
@@ -191,7 +221,7 @@ def run_plan(
         else:
             current_test = code
 
-        working_memory.update(task_memory)
+        working_memory.update(working_memory_i)
 
         if verbose:
             _CONSOLE.print(
@@ -231,6 +261,7 @@ def __init__(
         self,
         timeout: int = 600,
         tool_recommender: Optional[Sim] = None,
+        long_term_memory: Optional[Sim] = None,
         verbose: bool = False,
     ) -> None:
         self.planner = OpenAILLM(temperature=0.1, json_mode=True)
@@ -241,6 +272,11 @@ def __init__(
         else:
             self.tool_recommender = tool_recommender
         self.verbose = verbose
+        self._working_memory: Dict[str, List[str]] = {}
+        if long_term_memory is not None:
+            if "doc" not in long_term_memory.df.columns:
+                raise ValueError("Long term memory must have a 'doc' column.")
+        self.long_term_memory = long_term_memory
         if self.verbose:
             _LOGGER.setLevel(logging.INFO)
 
@@ -271,12 +307,12 @@ def chat_with_tests(
             f"""Plan:
 {tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
-        working_memory: Dict[str, List[str]] = {}
 
         working_code = ""
         working_test = ""
         success = False
 
+        __import__("ipdb").set_trace()
         while not success:
             working_code, working_test, plan, working_memory_i = run_plan(
                 user_req,
@@ -285,10 +321,11 @@ def chat_with_tests(
                 self.exec,
                 working_code,
                 self.tool_recommender,
+                self.long_term_memory,
                 self.verbose,
             )
             success = all(task["success"] for task in plan)
-            working_memory.update(working_memory_i)
+            self._working_memory.update(working_memory_i)
 
             if not success:
                 # TODO: ask for feedback and replan
@@ -296,5 +333,14 @@ def chat_with_tests(
 
         return working_code, working_test
 
+    @property
+    def working_memory(self) -> Sim:
+        data: Dict[str, List[str]] = {"desc": [], "doc": []}
+        for key, value in self._working_memory.items():
+            data["desc"].append(key)
+            data["doc"].append("\n".join(value))
+        df = pd.DataFrame(data)
+        return Sim(df, sim_key="desc")
+
     def log_progress(self, description: str) -> None:
         pass
diff --git a/vision_agent/agent/vision_agent_v2_prompt.py b/vision_agent/agent/vision_agent_v2_prompt.py
index 881274c5..6965aa7b 100644
--- a/vision_agent/agent/vision_agent_v2_prompt.py
+++ b/vision_agent/agent/vision_agent_v2_prompt.py
@@ -1,3 +1,8 @@
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+
 USER_REQ_SUBTASK_CONTEXT = """
 ## User Requirement
 {user_requirement}
@@ -6,11 +11,16 @@
 {subtask}
 """
 
-USER_REQ_CONTEXT = """
+USER_REQ_SUBTASK_WM_CONTEXT = """
 ## User Requirement
 {user_requirement}
-"""
 
+## Current Subtask
+{subtask}
+
+## Previous Task
+{working_memory}
+"""
 
 PLAN = """
 # Context
@@ -61,8 +71,9 @@
 {code}
 
 # Constraints
-- Write a function that accomplishes the User Requirement. You are supplied code from a previous task, feel free to copy over that code into your own implementation if you need it.
-- Always prioritize using pre-defined tools or code for the same functionality. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it.
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- You may receive previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running into the same issues when writing your code.
 - Write clean, readable, and well-documented code.
 
 # Output
@@ -102,6 +113,7 @@ def add(a: int, b: int) -> int:
 
 
 PREV_CODE_CONTEXT = """
+[previous impl]
 ```python
 {code}
 ```
@@ -112,18 +124,20 @@ def add(a: int, b: int) -> int:
 
 
 PREV_CODE_CONTEXT_WITH_REFLECTION = """
+[reflection on previous impl]
+{reflection}
+
+[new impl]
 ```python
 {code}
 ```
 
-[previous output]
+[new output]
 {result}
 
-[reflection on previous impl]
-{reflection}
 """
 
-
+# DEBUG needs no "[previous impl]" header: {previous_impl} is built from PREV_CODE_CONTEXT / PREV_CODE_CONTEXT_WITH_REFLECTION entries, which carry their own labels.
 DEBUG = """
 [example]
 Here is an example of debugging with reflection.
@@ -133,7 +147,6 @@ def add(a: int, b: int) -> int:
 [context]
 {context}
 
-[previous impl]
 {previous_impl}
 
 [instruction]
@@ -158,7 +171,7 @@ def add(a: int, b: int) -> int:
 {code}
 
 # Constraints
-- Write code to test the functionality of the provided code according to the Current Subtask. If you cannot test the code, then write code to visualize the result by calling the code.
+- Write code to test the functionality of the provided code according to the 'Current Subtask'. If you cannot test the code, then write code to visualize the result by calling the code.
 - Always prioritize using pre-defined tools for the same functionality.
 - Write clean, readable, and well-documented code.
 

From bf50f746362cd36b75bb0f619e66db572514b4db Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 10 May 2024 09:18:29 -0700
Subject: [PATCH 03/10] added dynamic re-planning

---
 vision_agent/agent/vision_agent_v2.py        | 88 +++++++++++++-------
 vision_agent/agent/vision_agent_v2_prompt.py |  2 +
 2 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index c2a875ce..dc5d9626 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -1,7 +1,7 @@
 import json
 import logging
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
 import pandas as pd
 from rich.console import Console
@@ -33,6 +33,15 @@
 _CONSOLE = Console()
 
 
+def build_working_memory(working_memory: Mapping[str, List[str]]) -> Sim:
+    data: Mapping[str, List[str]] = {"desc": [], "doc": []}
+    for key, value in working_memory.items():
+        data["desc"].append(key)
+        data["doc"].append("\n".join(value))
+    df = pd.DataFrame(data)  # type: ignore
+    return Sim(df, sim_key="desc")
+
+
 def extract_code(code: str) -> str:
     if "```python" in code:
         code = code[code.find("```python") + len("```python") :]
@@ -41,12 +50,21 @@ def extract_code(code: str) -> str:
 
 
 def write_plan(
-    user_requirements: str, tool_desc: str, model: LLM
-) -> List[Dict[str, Any]]:
+    chat: List[Dict[str, str]],
+    plan: Optional[List[Dict[str, Any]]],
+    tool_desc: str,
+    model: LLM,
+) -> Tuple[str, List[Dict[str, Any]]]:
+    # Get last user request
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+    user_requirements = chat[-1]["content"]
+
     context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
-    prompt = PLAN.format(context=context, plan="", tool_desc=tool_desc)
-    plan = json.loads(model(prompt).replace("```", "").strip())
-    return plan["plan"]  # type: ignore
+    prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
+    chat[-1]["content"] = prompt
+    plan = json.loads(model.chat(chat).replace("```", "").strip())
+    return plan["user_req"], plan["plan"]  # type: ignore
 
 
 def write_code(
@@ -123,7 +141,7 @@ def write_and_exec_code(
     user_req: str,
     subtask: str,
     orig_code: str,
-    code_writer_call: Callable,
+    code_writer_call: Callable[..., str],
     model: LLM,
     tool_info: str,
     exec: Execute,
@@ -191,6 +209,7 @@ def run_plan(
     current_test = ""
     retrieved_ltm = ""
     working_memory: Dict[str, List[str]] = {}
+
     for task in active_plan:
         _LOGGER.info(
             f"""
@@ -209,7 +228,7 @@ def run_plan(
             user_req,
             task["instruction"],
             current_code,
-            write_code if task["type"] == "code" else write_test,
+            write_code if task["type"] == "code" else write_test,  # type: ignore
             coder,
             tool_info,
             exec,
@@ -277,6 +296,7 @@ def __init__(
             if "doc" not in long_term_memory.df.columns:
                 raise ValueError("Long term memory must have a 'doc' column.")
         self.long_term_memory = long_term_memory
+        self.max_retries = 3
         if self.verbose:
             _LOGGER.setLevel(logging.INFO)
 
@@ -284,36 +304,47 @@ def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        plan: Optional[List[Dict[str, Any]]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        code, _ = self.chat_with_tests(input, image)
-        return code
+        results = self.chat_with_workflow(input, image, plan)
+        return results["code"]  # type: ignore
 
-    def chat_with_tests(
+    def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
-    ) -> Tuple[str, str]:
+        plan: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
 
-        user_req = chat[0]["content"]
         if image is not None:
-            user_req += f" Image name {image}"
+            # append file names to all user messages
+            for chat_i in chat:
+                if chat_i["role"] == "user":
+                    chat_i["content"] += f" Image name {image}"
+
+        working_code = ""
+        if plan is not None:
+            # grab the latest working code from a previous plan
+            for task in plan:
+                if "success" in task and "code" in task and task["success"]:
+                    working_code = task["code"]
 
-        plan = write_plan(user_req, TOOL_DESCRIPTIONS, self.planner)
+        user_req, plan = write_plan(chat, plan, TOOL_DESCRIPTIONS, self.planner)
         _LOGGER.info(
             f"""Plan:
 {tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
 
-        working_code = ""
         working_test = ""
+        working_memory: Dict[str, List[str]] = {}
         success = False
+        retries = 0
 
-        __import__("ipdb").set_trace()
-        while not success:
+        while not success and retries < self.max_retries:
             working_code, working_test, plan, working_memory_i = run_plan(
                 user_req,
                 plan,
@@ -325,22 +356,21 @@ def chat_with_tests(
                 self.verbose,
             )
             success = all(task["success"] for task in plan)
-            self._working_memory.update(working_memory_i)
+            working_memory.update(working_memory_i)
 
             if not success:
-                # TODO: ask for feedback and replan
+                # return to user and request feedback
                 break
 
-        return working_code, working_test
+            retries += 1
 
-    @property
-    def working_memory(self) -> Sim:
-        data: Dict[str, List[str]] = {"desc": [], "doc": []}
-        for key, value in self._working_memory.items():
-            data["desc"].append(key)
-            data["doc"].append("\n".join(value))
-        df = pd.DataFrame(data)
-        return Sim(df, sim_key="desc")
+        return {
+            "code": working_code,
+            "test": working_test,
+            "success": success,
+            "working_memory": build_working_memory(working_memory),
+            "plan": plan,
+        }
 
     def log_progress(self, description: str) -> None:
         pass
diff --git a/vision_agent/agent/vision_agent_v2_prompt.py b/vision_agent/agent/vision_agent_v2_prompt.py
index 6965aa7b..4003b4df 100644
--- a/vision_agent/agent/vision_agent_v2_prompt.py
+++ b/vision_agent/agent/vision_agent_v2_prompt.py
@@ -37,11 +37,13 @@
 - For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
 - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
 - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+- If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.
 
 Output a list of jsons in the following format:
 
 ```json
 {{
+    "user_req": str, # "a summarized version of the user requirement"
     "plan":
         [
             {{

From e7ef95d1feb1e3ee1fc9f06b5d06650076d86482 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 13:17:43 -0700
Subject: [PATCH 04/10] add gpt-4o

---
 vision_agent/llm/llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py
index 96b6477b..e3563fb6 100644
--- a/vision_agent/llm/llm.py
+++ b/vision_agent/llm/llm.py
@@ -34,7 +34,7 @@ class OpenAILLM(LLM):
 
     def __init__(
         self,
-        model_name: str = "gpt-4-turbo",
+        model_name: str = "gpt-4o",
         api_key: Optional[str] = None,
         json_mode: bool = False,
         system_prompt: Optional[str] = None,

From fd5563625fc23cce4502ca50c64b8cb8566f3933 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 13:21:40 -0700
Subject: [PATCH 05/10] update tests

---
 tests/test_llm.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_llm.py b/tests/test_llm.py
index bbcc203e..fc6da747 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -18,7 +18,7 @@ def test_generate_with_mock(openai_llm_mock):  # noqa: F811
     response = llm.generate("test prompt")
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_once_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 
@@ -31,7 +31,7 @@ def test_chat_with_mock(openai_llm_mock):  # noqa: F811
     response = llm.chat([{"role": "user", "content": "test prompt"}])
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_once_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 
@@ -44,7 +44,7 @@ def test_call_with_mock(openai_llm_mock):  # noqa: F811
     response = llm("test prompt")
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_once_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 

From 13df90fbcbe8cd4c7770beb84018d8b5b3e24235 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 14:16:14 -0700
Subject: [PATCH 06/10] update tests

---
 tests/test_llm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_llm.py b/tests/test_llm.py
index fc6da747..b10ba7fc 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -51,7 +51,7 @@ def test_call_with_mock(openai_llm_mock):  # noqa: F811
     response = llm([{"role": "user", "content": "test prompt"}])
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 

From 3bfab15ef4c6d388a38571bc5c061fa3db9f965a Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 14:27:02 -0700
Subject: [PATCH 07/10] fixed exit loop early

---
 vision_agent/agent/vision_agent_v2.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index dc5d9626..8c4fd9cd 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -91,7 +91,12 @@ def write_code(
 
 
 def write_test(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str,
+    subtask: str,
+    tool_info: str,
+    _: str,
+    code: str,
+    model: LLM
 ) -> str:
     prompt = TEST.format(
         context=USER_REQ_SUBTASK_CONTEXT.format(
@@ -233,7 +238,7 @@ def run_plan(
             tool_info,
             exec,
             retrieved_ltm,
-            verbose,
+            verbose=verbose,
         )
         if task["type"] == "code":
             current_code = code

From 6e09b1f869f6cd700152bd57d82afce90795f011 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 14:31:55 -0700
Subject: [PATCH 08/10] add some extra parsing for code snippets

---
 vision_agent/agent/vision_agent_v2.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index 8c4fd9cd..59ec7a2d 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -46,6 +46,8 @@ def extract_code(code: str) -> str:
     if "```python" in code:
         code = code[code.find("```python") + len("```python") :]
         code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
     return code
 
 

From 6bd0bb68d1658fe402ae15810266d535a90690f5 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 14:41:12 -0700
Subject: [PATCH 09/10] fix formatting

---
 vision_agent/agent/vision_agent_v2.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index 59ec7a2d..7f660fb3 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -93,12 +93,7 @@ def write_code(
 
 
 def write_test(
-    user_req: str,
-    subtask: str,
-    tool_info: str,
-    _: str,
-    code: str,
-    model: LLM
+    user_req: str, subtask: str, _: str, tool_info: str, code: str, model: LLM
 ) -> str:
     prompt = TEST.format(
         context=USER_REQ_SUBTASK_CONTEXT.format(

From ebd4d96ca9d4db63f587682359072889ae9246a8 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 13 May 2024 14:48:03 -0700
Subject: [PATCH 10/10] fix typing error

---
 vision_agent/agent/vision_agent_v2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index 7f660fb3..650b5c30 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -230,7 +230,7 @@ def run_plan(
             user_req,
             task["instruction"],
             current_code,
-            write_code if task["type"] == "code" else write_test,  # type: ignore
+            write_code if task["type"] == "code" else write_test,
             coder,
             tool_info,
             exec,