From c8d5343ec7698924c3275d183d1d3aaac66dde0e Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 17 May 2024 10:00:07 -0700
Subject: [PATCH 01/12] renamed prompt to prompts

---
 vision_agent/agent/vision_agent_v2.py         |   2 +-
 ...2_prompt.py => vision_agent_v2_prompts.py} |   0
 vision_agent/agent/vision_agent_v3_prompts.py | 107 ++++++++++++++++++
 3 files changed, 108 insertions(+), 1 deletion(-)
 rename vision_agent/agent/{vision_agent_v2_prompt.py => vision_agent_v2_prompts.py} (100%)
 create mode 100644 vision_agent/agent/vision_agent_v3_prompts.py

diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index a20afcee..1c7083a8 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -10,7 +10,7 @@
 from tabulate import tabulate
 
 from vision_agent.agent import Agent
-from vision_agent.agent.vision_agent_v2_prompt import (
+from vision_agent.agent.vision_agent_v2_prompts import (
     CODE,
     CODE_SYS_MSG,
     DEBUG,
diff --git a/vision_agent/agent/vision_agent_v2_prompt.py b/vision_agent/agent/vision_agent_v2_prompts.py
similarity index 100%
rename from vision_agent/agent/vision_agent_v2_prompt.py
rename to vision_agent/agent/vision_agent_v2_prompts.py
diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
new file mode 100644
index 00000000..85fe817f
--- /dev/null
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -0,0 +1,107 @@
+USER_REQ = """
+## User Request
+{user_request}
+"""
+
+
+PLAN = """
+# Context
+{context}
+
+# Tools Available
+{tool_desc}
+
+# Task
+Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing tools when necessary. Output a list of jsons in the following format:
+
+```json
+{{
+    "user_req": str # a detailed version of the user requirement
+    "plan":
+        [
+            {{
+                "instruction": str # what you should do in this task, one short phrase or sentence
+            }}
+        ]
+}}
+```
+"""
+
+CODE = """
+**Role**: You are a software programmer.
+
+**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`.
+{docstring}
+
+**Input Code Snippet**:
+```python
+def execute(image_path: str):
+    # Your code here
+```
+
+**User Instructions**:
+{question}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task.
+2. **Algorithm/Method Selection**: Decide on the most efficient way.
+3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
+4. **Code Generation**: Translate your pseudocode into executable Python code.
+"""
+
+TEST = """
+**Role**: As a tester, your task is to create comprehensive test cases for the incomplete `execute` function. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`.
+{docstring}
+
+**User Instructions**:
+{question}
+
+**Input Code Snippet**:
+```python
+### Please decided how would you want to generate test cases. Based on incomplete code or completed version.
+{code}
+```
+
+**1. Basic Test Cases**:
+- **Objective**: To verify the fundamental functionality under normal conditions.
+
+**2. Edge Test Cases**:
+- **Objective**: To evaluate the function's behavior under extreme or unusual conditions.
+
+**Instructions**:
+- Implement a comprehensive set of test cases following the guidelines above.
+- Ensure each test case is well-documented with comments explaining the scenario it covers.
+- Pay special attention to edge cases as they often reveal hidden bugs.
+"""
+
+FIX_BUG = """
+Please re-complete the code to fix the error message. Here is the previous version:
+```python
+{code}
+```
+
+When we run this code:
+```python
+{tests}
+```
+
+It raises this error:
+```python
+{result}
+```
+
+This is previous feedback provided on the code:
+{feedback}
+
+Please fix the bug by follow the error information and only return python code. You do not need return the test cases. The re-completion code should in triple backticks format(i.e., in ```python ```).
+"""
+

From 6f3e62ff98da798fc0b16f9ef561cf417db4701b Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 17 May 2024 21:52:45 -0700
Subject: [PATCH 02/12] added vision agent v3

---
 vision_agent/agent/__init__.py                |   1 +
 vision_agent/agent/vision_agent_v3.py         | 298 ++++++++++++++++++
 vision_agent/agent/vision_agent_v3_prompts.py | 113 ++++++-
 3 files changed, 398 insertions(+), 14 deletions(-)
 create mode 100644 vision_agent/agent/vision_agent_v3.py

diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py
index b358d3b0..2f62dbf1 100644
--- a/vision_agent/agent/__init__.py
+++ b/vision_agent/agent/__init__.py
@@ -4,3 +4,4 @@
 from .reflexion import Reflexion
 from .vision_agent import VisionAgent
 from .vision_agent_v2 import VisionAgentV2
+from .vision_agent_v3 import VisionAgentV3
diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
new file mode 100644
index 00000000..8ce99152
--- /dev/null
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -0,0 +1,298 @@
+import json
+import logging
+import sys
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
+
+from rich.console import Console
+from rich.syntax import Syntax
+from tabulate import tabulate
+
+from vision_agent.agent import Agent
+from vision_agent.agent.vision_agent_v3_prompts import (
+    CODE,
+    FIX_BUG,
+    PLAN,
+    REFLECT,
+    TEST,
+    USER_REQ,
+)
+from vision_agent.llm import LLM, OpenAILLM
+from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING
+from vision_agent.utils import Execute
+from vision_agent.utils.sim import Sim
+
+logging.basicConfig(stream=sys.stdout)
+_LOGGER = logging.getLogger(__name__)
+_MAX_TABULATE_COL_WIDTH = 80
+_EXECUTE = Execute(600)
+_CONSOLE = Console()
+
+
+def format_memory(memory: List[Dict[str, str]]) -> str:
+    return "\n\n".join(
+        [f"Code: {m['code']}\nFeedback: {m['feedback']}" for m in memory]
+    )
+
+
+def extract_code(code: str) -> str:
+    if "\n```python" in code:
+        start = "\n```python"
+    elif "```python" in code:
+        start = "```python"
+    else:
+        return code
+
+    code = code[code.find(start) + len(start) :]
+    code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
+    return code
+
+
+def extract_json(json_str: str) -> Dict[str, Any]:
+    """Parse an LLM reply as JSON, stripping a markdown code fence if needed."""
+    try:
+        return json.loads(json_str)  # type: ignore
+    except json.JSONDecodeError:
+        if "```json" in json_str:
+            json_str = json_str[json_str.find("```json") + len("```json") :]
+            json_str = json_str[: json_str.find("```")]
+        elif "```" in json_str:
+            json_str = json_str[json_str.find("```") + len("```") :]
+            # keep through the last "}" so the closing fence is dropped, not the brace
+            json_str = json_str[: json_str.rfind("}") + 1]
+    return json.loads(json_str)  # type: ignore
+
+
+def write_plan(
+    chat: List[Dict[str, str]],
+    tool_desc: str,
+    working_memory: str,
+    model: LLM,
+) -> List[Dict[str, str]]:
+    chat = chat.copy()
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    context = USER_REQ.format(user_request=user_request)
+    prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
+    chat[-1]["content"] = prompt
+    return extract_json(model.chat(chat))["plan"]
+
+
+def reflect(
+    chat: List[Dict[str, str]],
+    plan: str,
+    code: str,
+    test: str,
+    model: LLM,
+) -> Dict[str, Union[str, bool]]:
+    chat = chat.copy()
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+
+    user_request = chat[-1]["content"]
+    context = USER_REQ.format(user_request=user_request)
+    prompt = REFLECT.format(context=context, plan=plan, code=code, test=test)
+    chat[-1]["content"] = prompt
+    return extract_json(model.chat(chat))
+
+
+def write_and_test_code(
+    task: str,
+    tool_info: str,
+    tool_utils: str,
+    working_memory: str,
+    coder: LLM,
+    tester: LLM,
+    debugger: LLM,
+    verbosity: int = 0,
+    max_retries: int = 3,
+) -> Dict[str, Any]:
+    code = extract_code(
+        coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory))
+    )
+    test = extract_code(
+        tester(
+            TEST.format(
+                docstring=tool_utils, question=task, code=code, feedback=working_memory
+            )
+        )
+    )
+
+    success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
+    if verbosity == 2:
+        _LOGGER.info(f"First code and tests:")
+        _CONSOLE.print(
+            Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
+        )
+        _LOGGER.info(f"First result: {result}")
+
+    count = 0
+    new_working_memory = []
+    while not success and count < max_retries:
+        fixed_code_and_test = extract_json(
+            debugger(
+                FIX_BUG.format(
+                    code=code, tests=test, result=result, feedback=working_memory
+                )
+            )
+        )
+        if fixed_code_and_test["code"].strip() != "":
+            code = extract_code(fixed_code_and_test["code"])
+        if fixed_code_and_test["test"].strip() != "":
+            test = extract_code(fixed_code_and_test["test"])
+        new_working_memory.append(
+            {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]}
+        )
+
+        success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
+        if verbosity == 2:
+            _LOGGER.info(
+                f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}"
+            )
+            _CONSOLE.print(
+                Syntax(
+                    f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True
+                )
+            )
+            _LOGGER.info(f"Debug result: {result}")
+        count += 1
+
+    if verbosity == 1:
+        _CONSOLE.print(
+            Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
+        )
+        _LOGGER.info(f"Result: {result}")
+
+    return {
+        "code": code,
+        "test": test,
+        "success": success,
+        "working_memory": new_working_memory,
+    }
+
+
+def retrieve_tools(
+    plan: List[Dict[str, str]], tool_recommender: Sim, verbosity: int = 0
+) -> str:
+    tool_info = []
+    tool_desc = []
+    for task in plan:
+        tools = tool_recommender.top_k(task["instruction"], k=2, thresh=0.3)
+        tool_info.extend([e["doc"] for e in tools])
+        tool_desc.extend([e["desc"] for e in tools])
+    if verbosity == 2:
+        _LOGGER.info(f"Tools: {tool_desc}")
+    tool_info = set(tool_info)
+    return "\n\n".join(tool_info)
+
+
+class VisionAgentV3(Agent):
+    def __init__(
+        self,
+        timeout: int = 600,
+        planner: Optional[LLM] = None,
+        coder: Optional[LLM] = None,
+        tester: Optional[LLM] = None,
+        debugger: Optional[LLM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+    ) -> None:
+        self.planner = (
+            OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner
+        )
+        self.coder = OpenAILLM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILLM(temperature=0.0) if tester is None else tester
+        self.debugger = (
+            OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger
+        )
+
+        self.tool_recommender = (
+            Sim(TOOLS_DF, sim_key="desc")
+            if tool_recommender is None
+            else tool_recommender
+        )
+        self.verbosity = verbosity
+        self.max_retries = 3
+
+    def __call__(
+        self,
+        input: Union[List[Dict[str, str]], str],
+        image: Optional[Union[str, Path]] = None,
+    ) -> str:
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+        results = self.chat_with_workflow(input, image)
+        return results["code"]  # type: ignore
+
+    def chat_with_workflow(
+        self,
+        chat: List[Dict[str, str]],
+        image: Optional[Union[str, Path]] = None,
+    ) -> Dict[str, Any]:
+        if len(chat) == 0:
+            raise ValueError("Chat cannot be empty.")
+
+        if image is not None:
+            for chat_i in chat:
+                if chat_i["role"] == "user":
+                    chat_i["content"] += f" Image name {image}"
+
+        code = ""
+        test = ""
+        working_memory = []
+        results = {"code": "", "test": "", "plan": []}
+        plan = []
+        success = False
+        retries = 0
+
+        while not success and retries < self.max_retries:
+            plan_i = write_plan(
+                chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner
+            )
+            plan_i_str = "\n-".join([e["instruction"] for e in plan_i])
+            if self.verbosity == 1 or self.verbosity == 2:
+                _LOGGER.info(
+                    f"""
+{tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
+                )
+
+            tool_info = retrieve_tools(
+                plan_i,
+                self.tool_recommender,
+                self.verbosity,
+            )
+            results = write_and_test_code(
+                plan_i_str,
+                tool_info,
+                UTILITIES_DOCSTRING,
+                format_memory(working_memory),
+                self.coder,
+                self.tester,
+                self.debugger,
+                verbosity=self.verbosity,
+            )
+            success = cast(bool, results["success"])
+            code = cast(str, results["code"])
+            test = cast(str, results["test"])
+            working_memory.extend(results["working_memory"])
+            plan.append({"code": code, "test": test, "plan": plan_i})
+
+            reflection = reflect(chat, plan_i_str, code, test, self.planner)
+            success = cast(bool, reflection["success"])
+            working_memory.append(
+                {"code": f"{code}\n{test}", "feedback": reflection["feedback"]}
+            )
+
+        return {
+            "code": code,
+            "test": test,
+            "plan": plan,
+            "working_memory": working_memory,
+        }
+
+    def log_progress(self, description: str) -> None:
+        pass
diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
index 85fe817f..dcd7947d 100644
--- a/vision_agent/agent/vision_agent_v3_prompts.py
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -5,18 +5,20 @@
 
 
 PLAN = """
-# Context
+**Context**
 {context}
 
-# Tools Available
+**Tools Available**:
 {tool_desc}
 
-# Task
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
 Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing tools when necessary. Output a list of jsons in the following format:
 
 ```json
 {{
-    "user_req": str # a detailed version of the user requirement
     "plan":
         [
             {{
@@ -30,16 +32,16 @@
 CODE = """
 **Role**: You are a software programmer.
 
-**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create.
+**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do no call your code unnecessarily, a test will be run after it is submitted.
 
 **Documentation**:
 This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`.
+
 {docstring}
 
 **Input Code Snippet**:
 ```python
-def execute(image_path: str):
-    # Your code here
+# Your code here
 ```
 
 **User Instructions**:
@@ -56,10 +58,11 @@ def execute(image_path: str):
 """
 
 TEST = """
-**Role**: As a tester, your task is to create comprehensive test cases for the incomplete `execute` function. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability.
+**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible.
 
 **Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`.
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions, only the code provided by the user.
+
 {docstring}
 
 **User Instructions**:
@@ -78,18 +81,72 @@ def execute(image_path: str):
 - **Objective**: To evaluate the function's behavior under extreme or unusual conditions.
 
 **Instructions**:
-- Implement a comprehensive set of test cases following the guidelines above.
-- Ensure each test case is well-documented with comments explaining the scenario it covers.
-- Pay special attention to edge cases as they often reveal hidden bugs.
+1. Implement a comprehensive set of test cases following the guidelines above and testing each of the user's instructions.
+2. Ensure each test case is well-documented with comments explaining the scenario it covers.
+3. Pay special attention to edge cases as they often reveal hidden bugs.
+4. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
+5. If it is a vision task, you can visualize the output to ensure it is correct.
+
+You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
+```python
+# You can run assertions to ensure the function is working as expected
+assert function(input) == expected_output, "Test case description"
+
+# You can simply call the function to ensure it runs
+function(input)
+
+# Or you can visualize the output
+output = function(input)
+visualize(output)
+```
+
+**Examples**
+## Prompt 1:
+```python
+def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
+    \""" Detects cats and dogs in an image. Returns a dictionary with
+    {{
+        "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
+    }} 
+    \"""
+```
+
+## Completion 1:
+```python
+# We can test to ensure the output has the correct structure but we cannot test the
+# content of the output without knowing the image. We can test on "image.jpg" because
+# it is provided by the user so we know it exists.
+output = detect_cats_and_dogs("image.jpg")
+assert "cats" in output, "The output should contain 'cats'"
+assert "dogs" in output, "The output should contain 'dogs'"
+```
+
+## Prompt 2:
+```python
+def find_text(image_path: str, text: str) -> str:
+    \""" Finds the text in the image and returns the text. \"""
+
+## Completion 2:
+```python
+# Because we do not know ahead of time what text is in the image, we can only run the
+# code and print the results. We can test on "image.jpg" because it is provided by the
+# user so we know it exists.
+found_text = find_text("image.jpg", "Hello World")
+print(found_text)
+```
 """
 
+
 FIX_BUG = """
+**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so feel free to run !pip install to install missing packages.
+
+**Instructions**:
 Please re-complete the code to fix the error message. Here is the previous version:
 ```python
 {code}
 ```
 
-When we run this code:
+When we run this test code:
 ```python
 {tests}
 ```
@@ -102,6 +159,34 @@ def execute(image_path: str):
 This is previous feedback provided on the code:
 {feedback}
 
-Please fix the bug by follow the error information and only return python code. You do not need return the test cases. The re-completion code should in triple backticks format(i.e., in ```python ```).
+Please fix the bug by follow the error information and return a JSON object with the following format:
+{{
+    "reflections": str # any thoughts you have about the bug and how you fixed it
+    "code": str # the fixed code if any, else an empty string
+    "test": str # the fixed test code if any, else an empty string
+}}
 """
 
+
+REFLECT = """
+**Role**: You are a reflection agent. Your job is to look at the original user request and the code and test cases produced and determine if they meet the user's request. If they do not, you must provide feedback on how to improve the code and test cases.
+
+**Context**:
+{context}
+
+**Plan**:
+{plan}
+
+**Code**:
+{code}
+
+**Test Cases**:
+{test}
+
+**Instructions**:
+Respond in JSON format with the following structure:
+{{
+    "feedback": str # the feedback you would give to the coder and tester
+    "success": bool # whether the code and tests meet the user request
+}}
+"""

From 81ea443c7baf6acff4abf1e1aab6abf7d49637ea Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 17 May 2024 21:52:57 -0700
Subject: [PATCH 03/12] fixed execute issue

---
 vision_agent/utils/execute.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index aa882728..b1c3417b 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -4,6 +4,7 @@
 import base64 as b64
 import io
 import re
+from time import sleep
 from typing import Dict, List, Tuple
 
 import nbformat
@@ -75,6 +76,7 @@ def reset(self) -> None:
         self.terminate()
         self.nb = nbformat.v4.new_notebook()
         self.nb_client = NotebookClient(self.nb, timeout=self.timeout)
+        sleep(1)
         self.build()
 
     def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
@@ -83,6 +85,7 @@ def run_cell(self, cell: NotebookNode, cell_index: int) -> Tuple[bool, str]:
             return parse_outputs(self.nb.cells[-1].outputs)
         except CellTimeoutError:
             run_sync(self.nb_client.km.interrupt_kernel)()  # type: ignore
+            sleep(1)
             return False, "Cell execution timed out."
         except DeadKernelError:
             self.reset()

From a61497b5626e920934b74d6acec38d435db09195 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 17 May 2024 21:58:09 -0700
Subject: [PATCH 04/12] fixed type issue

---
 vision_agent/agent/vision_agent_v3.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index 8ce99152..3182a674 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -79,7 +79,7 @@ def write_plan(
     context = USER_REQ.format(user_request=user_request)
     prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory)
     chat[-1]["content"] = prompt
-    return extract_json(model.chat(chat))["plan"]
+    return extract_json(model.chat(chat))["plan"]  # type: ignore
 
 
 def reflect(
@@ -186,8 +186,8 @@ def retrieve_tools(
         tool_desc.extend([e["desc"] for e in tools])
     if verbosity == 2:
         _LOGGER.info(f"Tools: {tool_desc}")
-    tool_info = set(tool_info)
-    return "\n\n".join(tool_info)
+    tool_info_set = set(tool_info)
+    return "\n\n".join(tool_info_set)
 
 
 class VisionAgentV3(Agent):
@@ -243,7 +243,7 @@ def chat_with_workflow(
 
         code = ""
         test = ""
-        working_memory = []
+        working_memory: List[Dict[str, str]] = []
         results = {"code": "", "test": "", "plan": []}
         plan = []
         success = False
@@ -278,13 +278,14 @@ def chat_with_workflow(
             success = cast(bool, results["success"])
             code = cast(str, results["code"])
             test = cast(str, results["test"])
-            working_memory.extend(results["working_memory"])
+            working_memory.extend(results["working_memory"])  # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
             reflection = reflect(chat, plan_i_str, code, test, self.planner)
+            feedback = cast(str, reflection["feedback"])
             success = cast(bool, reflection["success"])
             working_memory.append(
-                {"code": f"{code}\n{test}", "feedback": reflection["feedback"]}
+                {"code": f"{code}\n{test}", "feedback": feedback}
             )
 
         return {

From 482fe2d3dfa2d43016f97f707d687be0c3927430 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 17 May 2024 21:59:13 -0700
Subject: [PATCH 05/12] fixed flake8

---
 vision_agent/agent/vision_agent_v3.py         | 4 ++--
 vision_agent/agent/vision_agent_v3_prompts.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index 3182a674..0ea4b248 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -2,7 +2,7 @@
 import logging
 import sys
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Union, cast
 
 from rich.console import Console
 from rich.syntax import Syntax
@@ -124,7 +124,7 @@ def write_and_test_code(
 
     success, result = _EXECUTE.run_isolation(f"{code}\n{test}")
     if verbosity == 2:
-        _LOGGER.info(f"First code and tests:")
+        _LOGGER.info("First code and tests:")
         _CONSOLE.print(
             Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True)
         )
diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
index dcd7947d..5408b34c 100644
--- a/vision_agent/agent/vision_agent_v3_prompts.py
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -107,7 +107,7 @@ def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
     \""" Detects cats and dogs in an image. Returns a dictionary with
     {{
         "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...]
-    }} 
+    }}
     \"""
 ```
 

From 6d786378c307c5077f6e99628929ce12f1043418 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 17 May 2024 22:01:10 -0700
Subject: [PATCH 06/12] black and isort

---
 tests/test_tools.py                   | 8 ++++----
 vision_agent/agent/vision_agent_v3.py | 4 +---
 vision_agent/tools/tools_v2.py        | 6 +++---
 3 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/tests/test_tools.py b/tests/test_tools.py
index 2a848a02..56ca2e02 100644
--- a/tests/test_tools.py
+++ b/tests/test_tools.py
@@ -2,13 +2,13 @@
 
 from vision_agent.tools.tools_v2 import (
     clip,
-    zero_shot_counting,
-    visual_prompt_counting,
-    image_question_answering,
-    ocr,
     grounding_dino,
     grounding_sam,
     image_caption,
+    image_question_answering,
+    ocr,
+    visual_prompt_counting,
+    zero_shot_counting,
 )
 
 
diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index 0ea4b248..06e37361 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -284,9 +284,7 @@ def chat_with_workflow(
             reflection = reflect(chat, plan_i_str, code, test, self.planner)
             feedback = cast(str, reflection["feedback"])
             success = cast(bool, reflection["success"])
-            working_memory.append(
-                {"code": f"{code}\n{test}", "feedback": feedback}
-            )
+            working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
 
         return {
             "code": code,
diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py
index 37e76a28..04f4dedf 100644
--- a/vision_agent/tools/tools_v2.py
+++ b/vision_agent/tools/tools_v2.py
@@ -16,12 +16,12 @@
 from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.image_utils import (
+    b64_to_pil,
     convert_to_b64,
+    denormalize_bbox,
+    get_image_size,
     normalize_bbox,
     rle_decode,
-    b64_to_pil,
-    get_image_size,
-    denormalize_bbox,
 )
 
 COLORS = [

From a539cbc55efe0f6becb9e27e1422b97945c6fae2 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 20 May 2024 07:15:29 +0900
Subject: [PATCH 07/12] switch to simple test case

---
 vision_agent/agent/vision_agent_v3.py         |  3 +-
 vision_agent/agent/vision_agent_v3_prompts.py | 40 ++++++++++++++-----
 2 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index 06e37361..b04eac25 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -14,6 +14,7 @@
     FIX_BUG,
     PLAN,
     REFLECT,
+    SIMPLE_TEST,
     TEST,
     USER_REQ,
 )
@@ -116,7 +117,7 @@ def write_and_test_code(
     )
     test = extract_code(
         tester(
-            TEST.format(
+            SIMPLE_TEST.format(
                 docstring=tool_utils, question=task, code=code, feedback=working_memory
             )
         )
diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
index 5408b34c..efecabe2 100644
--- a/vision_agent/agent/vision_agent_v3_prompts.py
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -74,18 +74,11 @@
 {code}
 ```
 
-**1. Basic Test Cases**:
-- **Objective**: To verify the fundamental functionality under normal conditions.
-
-**2. Edge Test Cases**:
-- **Objective**: To evaluate the function's behavior under extreme or unusual conditions.
-
 **Instructions**:
-1. Implement a comprehensive set of test cases following the guidelines above and testing each of the user's instructions.
+1. Verify the fundamental functionality under normal conditions.
 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
-3. Pay special attention to edge cases as they often reveal hidden bugs.
-4. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
-5. If it is a vision task, you can visualize the output to ensure it is correct.
+3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
+4. DO NOT mock any functions, you must test their functionality as is.
 
 You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
 ```python
@@ -100,7 +93,7 @@
 visualize(output)
 ```
 
-**Examples**
+**Examples**:
 ## Prompt 1:
 ```python
 def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
@@ -137,6 +130,31 @@ def find_text(image_path: str, text: str) -> str:
 """
 
 
+SIMPLE_TEST = """
+**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions, only the code provided by the user.
+
+{docstring}
+
+**User Instructions**:
+{question}
+
+**Input Code Snippet**:
+```python
+### Please decided how would you want to generate test cases. Based on incomplete code or completed version.
+{code}
+```
+
+**Instructions**:
+1. Verify the fundamental functionality under normal conditions.
+2. Ensure each test case is well-documented with comments explaining the scenario it covers.
+3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file.
+4. DO NOT mock any functions, you must test their functionality as is.
+"""
+
+
 FIX_BUG = """
 **Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so feel free to run !pip install to install missing packages.
 

From b23a48088ce0e4d7411f77a833cd15300fd97354 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 20 May 2024 07:49:57 +0900
Subject: [PATCH 08/12] fixed issue with chat not resetting

---
 vision_agent/agent/vision_agent_v3.py         | 27 ++++++++++++-------
 vision_agent/agent/vision_agent_v3_prompts.py | 21 +++++++++++----
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index b04eac25..cb1f704b 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import logging
 import sys
@@ -11,6 +12,7 @@
 from vision_agent.agent import Agent
 from vision_agent.agent.vision_agent_v3_prompts import (
     CODE,
+    FEEDBACK,
     FIX_BUG,
     PLAN,
     REFLECT,
@@ -31,8 +33,13 @@
 
 
 def format_memory(memory: List[Dict[str, str]]) -> str:
-    return "\n\n".join(
-        [f"Code: {m['code']}\nFeedback: {m['feedback']}" for m in memory]
+    return FEEDBACK.format(
+        feedback="\n".join(
+            [
+                f"Feedback {i}:\nCode: {m['code']}\nFeedback: {m['feedback']}\n"
+                for i, m in enumerate(memory)
+            ]
+        )
     )
 
 
@@ -72,7 +79,7 @@ def write_plan(
     working_memory: str,
     model: LLM,
 ) -> List[Dict[str, str]]:
-    chat = chat.copy()
+    chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
         raise ValueError("Last chat message must be from the user.")
 
@@ -87,16 +94,15 @@ def reflect(
     chat: List[Dict[str, str]],
     plan: str,
     code: str,
-    test: str,
     model: LLM,
 ) -> Dict[str, Union[str, bool]]:
-    chat = chat.copy()
+    chat = copy.deepcopy(chat)
     if chat[-1]["role"] != "user":
         raise ValueError("Last chat message must be from the user.")
 
     user_request = chat[-1]["content"]
     context = USER_REQ.format(user_request=user_request)
-    prompt = REFLECT.format(context=context, plan=plan, code=code, test=test)
+    prompt = REFLECT.format(context=context, plan=plan, code=code)
     chat[-1]["content"] = prompt
     return extract_json(model.chat(chat))
 
@@ -182,7 +188,7 @@ def retrieve_tools(
     tool_info = []
     tool_desc = []
     for task in plan:
-        tools = tool_recommender.top_k(task["instruction"], k=2, thresh=0.3)
+        tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
         tool_info.extend([e["doc"] for e in tools])
         tool_desc.extend([e["desc"] for e in tools])
     if verbosity == 2:
@@ -251,10 +257,11 @@ def chat_with_workflow(
         retries = 0
 
         while not success and retries < self.max_retries:
+            __import__("ipdb").set_trace()
             plan_i = write_plan(
                 chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner
             )
-            plan_i_str = "\n-".join([e["instruction"] for e in plan_i])
+            plan_i_str = "\n-".join([e["instructions"] for e in plan_i])
             if self.verbosity == 1 or self.verbosity == 2:
                 _LOGGER.info(
                     f"""
@@ -282,7 +289,9 @@ def chat_with_workflow(
             working_memory.extend(results["working_memory"])  # type: ignore
             plan.append({"code": code, "test": test, "plan": plan_i})
 
-            reflection = reflect(chat, plan_i_str, code, test, self.planner)
+            reflection = reflect(chat, plan_i_str, code, self.planner)
+            if self.verbosity > 0:
+                _LOGGER.info(f"Reflection: {reflection}")
             feedback = cast(str, reflection["feedback"])
             success = cast(bool, reflection["success"])
             working_memory.append({"code": f"{code}\n{test}", "feedback": feedback})
diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
index efecabe2..087eb042 100644
--- a/vision_agent/agent/vision_agent_v3_prompts.py
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -3,6 +3,12 @@
 {user_request}
 """
 
+FEEDBACK = """
+## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again.
+
+{feedback}
+"""
+
 
 PLAN = """
 **Context**
@@ -22,7 +28,7 @@
     "plan":
         [
             {{
-                "instruction": str # what you should do in this task, one short phrase or sentence
+                "instructions": str # what you should do in this task, one short phrase or sentence
             }}
         ]
 }}
@@ -147,6 +153,9 @@ def find_text(image_path: str, text: str) -> str:
 {code}
 ```
 
+**Previous Feedback**:
+{feedback}
+
 **Instructions**:
 1. Verify the fundamental functionality under normal conditions.
 2. Ensure each test case is well-documented with comments explaining the scenario it covers.
@@ -187,7 +196,7 @@ def find_text(image_path: str, text: str) -> str:
 
 
 REFLECT = """
-**Role**: You are a reflection agent. Your job is to look at the original user request and the code and test cases produced and determine if they meet the user's request. If they do not, you must provide feedback on how to improve the code and test cases.
+**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if it meets the user's request. If it does not, you must provide feedback on how to improve the code and test cases.
 
 **Context**:
 {context}
@@ -198,10 +207,12 @@ def find_text(image_path: str, text: str) -> str:
 **Code**:
 {code}
 
-**Test Cases**:
-{test}
-
 **Instructions**:
+1. **Understand the User Request**: Read the user request and understand what the user is asking for.
+2. **Review the Plan**: Check the plan to see if it is a good approach to solving the user request.
+3. **Review the Code**: Check the code to see if it follows the plan and solves the user request.
+4. Do no add reflections on test cases, these are taken care of.
+
 Respond in JSON format with the following structure:
 {{
     "feedback": str # the feedback you would give to the coder and tester

From f74eacb7f3eb47050fdd851996dd82f9782e5e03 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Mon, 20 May 2024 07:54:49 +0900
Subject: [PATCH 09/12] removed unused import

---
 vision_agent/agent/vision_agent_v3.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index cb1f704b..79ed175d 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -17,7 +17,6 @@
     PLAN,
     REFLECT,
     SIMPLE_TEST,
-    TEST,
     USER_REQ,
 )
 from vision_agent.llm import LLM, OpenAILLM

From f08fb9019b1bea7de1297973edb9bddda6e90b58 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Tue, 21 May 2024 07:16:26 +0900
Subject: [PATCH 10/12] prompt updates

---
 vision_agent/agent/vision_agent_v3_prompts.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
index 087eb042..bcd84952 100644
--- a/vision_agent/agent/vision_agent_v3_prompts.py
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -21,7 +21,7 @@
 {feedback}
 
 **Instructions**:
-Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing tools when necessary. Output a list of jsons in the following format:
+Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format:
 
 ```json
 {{
@@ -38,7 +38,7 @@
 CODE = """
 **Role**: You are a software programmer.
 
-**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do no call your code unnecessarily, a test will be run after it is submitted.
+**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted.
 
 **Documentation**:
 This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`.
@@ -67,7 +67,7 @@
 **Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible.
 
 **Documentation**:
-This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions, only the code provided by the user.
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions. Test only the code provided by the user.
 
 {docstring}
 

From e28fd087e49d704b0810bff30cff2f4f9396ff00 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Tue, 21 May 2024 07:41:32 +0900
Subject: [PATCH 11/12] update prompts

---
 vision_agent/agent/vision_agent_v3.py         | 2 +-
 vision_agent/agent/vision_agent_v3_prompts.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index 79ed175d..bc4ad498 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -35,7 +35,7 @@ def format_memory(memory: List[Dict[str, str]]) -> str:
     return FEEDBACK.format(
         feedback="\n".join(
             [
-                f"Feedback {i}:\nCode: {m['code']}\nFeedback: {m['feedback']}\n"
+                f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n"
                 for i, m in enumerate(memory)
             ]
         )
diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py
index bcd84952..3e0813af 100644
--- a/vision_agent/agent/vision_agent_v3_prompts.py
+++ b/vision_agent/agent/vision_agent_v3_prompts.py
@@ -196,7 +196,7 @@ def find_text(image_path: str, text: str) -> str:
 
 
 REFLECT = """
-**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if it meets the user's request. If it does not, you must provide feedback on how to improve the code and test cases.
+**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad.
 
 **Context**:
 {context}
@@ -209,9 +209,9 @@ def find_text(image_path: str, text: str) -> str:
 
 **Instructions**:
 1. **Understand the User Request**: Read the user request and understand what the user is asking for.
-2. **Review the Plan**: Check the plan to see if it is a good approach to solving the user request.
-3. **Review the Code**: Check the code to see if it follows the plan and solves the user request.
-4. Do no add reflections on test cases, these are taken care of.
+2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request.
+3. **Review the Code**: Check the code to see if it solves the user request.
+4. DO NOT add any reflections for test cases, these are taken care of.
 
 Respond in JSON format with the following structure:
 {{

From 0b1fa8bda6021eabd3b68799122e956fea1a69a6 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Tue, 21 May 2024 07:48:19 +0900
Subject: [PATCH 12/12] remove debug

---
 vision_agent/agent/vision_agent_v3.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py
index bc4ad498..d8de28c6 100644
--- a/vision_agent/agent/vision_agent_v3.py
+++ b/vision_agent/agent/vision_agent_v3.py
@@ -256,7 +256,6 @@ def chat_with_workflow(
         retries = 0
 
         while not success and retries < self.max_retries:
-            __import__("ipdb").set_trace()
             plan_i = write_plan(
                 chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner
             )