From d6fd63e816fc558fd686449fffb5deebc7e3c3ac Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 13 May 2024 15:22:14 -0700
Subject: [PATCH] Add Long Term Memory and Feedback (#80)

* fixed save and load
* added long term memory
* added dynamic re-planning
* add gpt-4o
* update tests
* update tests
* fixed exit loop early
* add some extra parsing for code snippets
* fix formatting
* fix typing error
---
 tests/test_llm.py                            |   8 +-
 vision_agent/agent/vision_agent_v2.py        | 146 ++++++++++++++-----
 vision_agent/agent/vision_agent_v2_prompt.py |  35 +++--
 vision_agent/llm/llm.py                      |   2 +-
 vision_agent/utils/__init__.py               |   2 +-
 vision_agent/utils/sim.py                    |  19 ++-
 6 files changed, 160 insertions(+), 52 deletions(-)

diff --git a/tests/test_llm.py b/tests/test_llm.py
index bbcc203e..b10ba7fc 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -18,7 +18,7 @@ def test_generate_with_mock(openai_llm_mock):  # noqa: F811
     response = llm.generate("test prompt")
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_once_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 
@@ -31,7 +31,7 @@ def test_chat_with_mock(openai_llm_mock):  # noqa: F811
     response = llm.chat([{"role": "user", "content": "test prompt"}])
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_once_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 
@@ -44,14 +44,14 @@ def test_call_with_mock(openai_llm_mock):  # noqa: F811
     response = llm("test prompt")
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_once_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
 
     response = llm([{"role": "user", "content": "test prompt"}])
     assert response == "mocked response"
     openai_llm_mock.chat.completions.create.assert_called_with(
-        model="gpt-4-turbo",
+        model="gpt-4o",
         messages=[{"role": "user", "content": "test prompt"}],
     )
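For context, the updated assertions all encode the same expectation: when no model is passed, the underlying client should be called with the new "gpt-4o" default. The sketch below illustrates that mock-and-assert pattern in a self-contained way; the `generate` helper is a stand-in written for this illustration, not the repo's fixture or `OpenAILLM` implementation.

```python
from unittest.mock import MagicMock

DEFAULT_MODEL = "gpt-4o"  # the default the updated tests now expect


def generate(client, prompt: str, model: str = DEFAULT_MODEL) -> str:
    # Stand-in for an LLM wrapper: forwards the default model to the client.
    resp = client.chat.completions.create(
        model=model, messages=[{"role": "user", "content": prompt}]
    )
    return resp.choices[0].message.content


client = MagicMock()
client.chat.completions.create.return_value.choices = [
    MagicMock(message=MagicMock(content="mocked response"))
]
assert generate(client, "test prompt") == "mocked response"
client.chat.completions.create.assert_called_once_with(
    model=DEFAULT_MODEL, messages=[{"role": "user", "content": "test prompt"}]
)
```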
diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py
index 0889bf4e..650b5c30 100644
--- a/vision_agent/agent/vision_agent_v2.py
+++ b/vision_agent/agent/vision_agent_v2.py
@@ -1,8 +1,9 @@
 import json
 import logging
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Mapping, Optional, Tuple, Union
 
+import pandas as pd
 from rich.console import Console
 from rich.syntax import Syntax
 from tabulate import tabulate
@@ -20,6 +21,7 @@
     TEST,
     USER_REQ_CONTEXT,
     USER_REQ_SUBTASK_CONTEXT,
+    USER_REQ_SUBTASK_WM_CONTEXT,
 )
 from vision_agent.llm import LLM, OpenAILLM
 from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF
@@ -31,28 +33,53 @@
 _CONSOLE = Console()
 
 
+def build_working_memory(working_memory: Mapping[str, List[str]]) -> Sim:
+    data: Mapping[str, List[str]] = {"desc": [], "doc": []}
+    for key, value in working_memory.items():
+        data["desc"].append(key)
+        data["doc"].append("\n".join(value))
+    df = pd.DataFrame(data)  # type: ignore
+    return Sim(df, sim_key="desc")
+
+
 def extract_code(code: str) -> str:
     if "```python" in code:
         code = code[code.find("```python") + len("```python") :]
         code = code[: code.find("```")]
+    if code.startswith("python\n"):
+        code = code[len("python\n") :]
     return code
 
 
 def write_plan(
-    user_requirements: str, tool_desc: str, model: LLM
-) -> List[Dict[str, Any]]:
+    chat: List[Dict[str, str]],
+    plan: Optional[List[Dict[str, Any]]],
+    tool_desc: str,
+    model: LLM,
+) -> Tuple[str, List[Dict[str, Any]]]:
+    # Get last user request
+    if chat[-1]["role"] != "user":
+        raise ValueError("Last chat message must be from the user.")
+    user_requirements = chat[-1]["content"]
+
     context = USER_REQ_CONTEXT.format(user_requirement=user_requirements)
-    prompt = PLAN.format(context=context, plan="", tool_desc=tool_desc)
-    plan = json.loads(model(prompt).replace("```", "").strip())
-    return plan["plan"]  # type: ignore
+    prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc)
+    chat[-1]["content"] = prompt
+    plan = json.loads(model.chat(chat).replace("```", "").strip())
+    return plan["user_req"], plan["plan"]  # type: ignore
 
 
 def write_code(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str,
+    subtask: str,
+    working_memory: str,
+    tool_info: str,
+    code: str,
+    model: LLM,
 ) -> str:
     prompt = CODE.format(
-        context=USER_REQ_SUBTASK_CONTEXT.format(
-            user_requirement=user_req, subtask=subtask
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req, working_memory=working_memory, subtask=subtask
         ),
         tool_info=tool_info,
         code=code,
@@ -66,7 +93,7 @@ def write_code(
 
 
 def write_test(
-    user_req: str, subtask: str, tool_info: str, code: str, model: LLM
+    user_req: str, subtask: str, tool_info: str, _: str, code: str, model: LLM
 ) -> str:
     prompt = TEST.format(
         context=USER_REQ_SUBTASK_CONTEXT.format(
@@ -83,14 +110,24 @@ def write_test(
     return extract_code(code)
 
 
-def debug_code(sub_task: str, working_memory: List[str], model: LLM) -> Tuple[str, str]:
+def debug_code(
+    user_req: str,
+    subtask: str,
+    retrieved_ltm: str,
+    working_memory: str,
+    model: LLM,
+) -> Tuple[str, str]:
     # Make debug model output JSON
     if hasattr(model, "kwargs"):
         model.kwargs["response_format"] = {"type": "json_object"}
     prompt = DEBUG.format(
         debug_example=DEBUG_EXAMPLE,
-        context=USER_REQ_CONTEXT.format(user_requirement=sub_task),
-        previous_impl="\n".join(working_memory),
+        context=USER_REQ_SUBTASK_WM_CONTEXT.format(
+            user_requirement=user_req,
+            subtask=subtask,
+            working_memory=retrieved_ltm,
+        ),
+        previous_impl=working_memory,
     )
     messages = [
         {"role": "system", "content": DEBUG_SYS_MSG},
@@ -106,10 +143,11 @@ def write_and_exec_code(
     user_req: str,
     subtask: str,
     orig_code: str,
-    code_writer_call: Callable,
+    code_writer_call: Callable[..., str],
     model: LLM,
     tool_info: str,
     exec: Execute,
+    retrieved_ltm: str,
     max_retry: int = 3,
     verbose: bool = False,
 ) -> Tuple[bool, str, str, Dict[str, List[str]]]:
@@ -117,8 +155,9 @@ def write_and_exec_code(
     counter = 0
     reflection = ""
 
-    # TODO: add working memory to code_writer_call and debug_code
-    code = code_writer_call(user_req, subtask, tool_info, orig_code, model)
+    code = code_writer_call(
+        user_req, subtask, retrieved_ltm, tool_info, orig_code, model
+    )
     success, result = exec.run_isolation(code)
     working_memory: Dict[str, List[str]] = {}
     while not success and counter < max_retry:
@@ -136,7 +175,9 @@ def write_and_exec_code(
                 PREV_CODE_CONTEXT.format(code=code, result=result)
             )
 
-        code, reflection = debug_code(subtask, working_memory[subtask], model)
+        code, reflection = debug_code(
+            user_req, subtask, retrieved_ltm, "\n".join(working_memory[subtask]), model
+        )
         success, result = exec.run_isolation(code)
         counter += 1
         if verbose:
@@ -148,7 +189,7 @@ def write_and_exec_code(
     if success:
         working_memory[subtask].append(
             PREV_CODE_CONTEXT_WITH_REFLECTION.format(
-                code=code, result=result, reflection=reflection
+                reflection=reflection, code=code, result=result
             )
         )
 
@@ -162,12 +203,15 @@ def run_plan(
     exec: Execute,
     code: str,
     tool_recommender: Sim,
+    long_term_memory: Optional[Sim] = None,
     verbose: bool = False,
 ) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]:
     active_plan = [e for e in plan if "success" not in e or not e["success"]]
-    working_memory: Dict[str, List[str]] = {}
     current_code = code
     current_test = ""
+    retrieved_ltm = ""
+    working_memory: Dict[str, List[str]] = {}
+
     for task in active_plan:
         _LOGGER.info(
             f"""
@@ -176,7 +220,13 @@ def run_plan(
         tool_info = "\n".join(
             [e["doc"] for e in tool_recommender.top_k(task["instruction"])]
         )
-        success, code, result, task_memory = write_and_exec_code(
+
+        if long_term_memory is not None:
+            retrieved_ltm = "\n".join(
+                [e["doc"] for e in long_term_memory.top_k(task["instruction"], 1)]
+            )
+
+        success, code, result, working_memory_i = write_and_exec_code(
             user_req,
             task["instruction"],
             current_code,
@@ -184,14 +234,15 @@ def run_plan(
             coder,
             tool_info,
             exec,
-            verbose,
+            retrieved_ltm,
+            verbose=verbose,
         )
 
         if task["type"] == "code":
             current_code = code
         else:
             current_test = code
 
-        working_memory.update(task_memory)
+        working_memory.update(working_memory_i)
 
         if verbose:
             _CONSOLE.print(
@@ -231,6 +282,7 @@ def __init__(
         self,
         timeout: int = 600,
         tool_recommender: Optional[Sim] = None,
+        long_term_memory: Optional[Sim] = None,
         verbose: bool = False,
     ) -> None:
         self.planner = OpenAILLM(temperature=0.1, json_mode=True)
@@ -241,6 +293,12 @@ def __init__(
         else:
             self.tool_recommender = tool_recommender
         self.verbose = verbose
+        self._working_memory: Dict[str, List[str]] = {}
+        if long_term_memory is not None:
+            if "doc" not in long_term_memory.df.columns:
+                raise ValueError("Long term memory must have a 'doc' column.")
+        self.long_term_memory = long_term_memory
+        self.max_retries = 3
         if self.verbose:
             _LOGGER.setLevel(logging.INFO)
 
@@ -248,36 +306,47 @@ def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
         image: Optional[Union[str, Path]] = None,
+        plan: Optional[List[Dict[str, Any]]] = None,
     ) -> str:
         if isinstance(input, str):
             input = [{"role": "user", "content": input}]
-        code, _ = self.chat_with_tests(input, image)
-        return code
+        results = self.chat_with_workflow(input, image, plan)
+        return results["code"]  # type: ignore
 
-    def chat_with_tests(
+    def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
-    ) -> Tuple[str, str]:
+        plan: Optional[List[Dict[str, Any]]] = None,
+    ) -> Dict[str, Any]:
         if len(chat) == 0:
             raise ValueError("Input cannot be empty.")
 
-        user_req = chat[0]["content"]
         if image is not None:
-            user_req += f" Image name {image}"
+            # append file names to all user messages
+            for chat_i in chat:
+                if chat_i["role"] == "user":
+                    chat_i["content"] += f" Image name {image}"
+
+        working_code = ""
+        if plan is not None:
+            # grab the latest working code from a previous plan
+            for task in plan:
+                if "success" in task and "code" in task and task["success"]:
+                    working_code = task["code"]
 
-        plan = write_plan(user_req, TOOL_DESCRIPTIONS, self.planner)
+        user_req, plan = write_plan(chat, plan, TOOL_DESCRIPTIONS, self.planner)
         _LOGGER.info(
             f"""Plan:
 {tabulate(tabular_data=plan, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}"""
         )
 
-        working_memory: Dict[str, List[str]] = {}
-        working_code = ""
         working_test = ""
+        working_memory: Dict[str, List[str]] = {}
         success = False
+        retries = 0
 
-        while not success:
+        while not success and retries < self.max_retries:
             working_code, working_test, plan, working_memory_i = run_plan(
                 user_req,
                 plan,
@@ -285,16 +354,25 @@ def chat_with_tests(
                 self.exec,
                 working_code,
                 self.tool_recommender,
+                self.long_term_memory,
                 self.verbose,
             )
             success = all(task["success"] for task in plan)
             working_memory.update(working_memory_i)
 
             if not success:
-                # TODO: ask for feedback and replan
+                # return to user and request feedback
                 break
 
-            retries += 1
+            retries += 1
+
+        return {
+            "code": working_code,
+            "test": working_test,
+            "success": success,
+            "working_memory": build_working_memory(working_memory),
+            "plan": plan,
+        }
 
     def log_progress(self, description: str) -> None:
         pass
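A rough usage sketch of the workflow API introduced above. The data, prompts, and file names are illustrative only; a real run needs an OpenAI API key (to embed the "desc" column and drive the planner/coder models) and an actual image. Note how the returned plan can be fed back with follow-up feedback: `run_plan` skips steps already marked successful, and the planner prompt rewrites the failed ones.

```python
import pandas as pd

from vision_agent.agent.vision_agent_v2 import VisionAgentV2
from vision_agent.utils import Sim

# Long-term memory must expose a "doc" column (checked in __init__ above).
ltm = Sim(
    pd.DataFrame(
        {
            "desc": ["count objects in an image"],
            "doc": ["use grounding_dino to detect, then count the returned boxes"],
        }
    ),
    sim_key="desc",
)

agent = VisionAgentV2(long_term_memory=ltm, verbose=True)
results = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the cars in the image"}],
    image="cars.jpg",
)
print(results["success"])
print(results["code"])

# Dynamic re-planning: pass the previous plan back in with user feedback so
# only the unsuccessful subtasks are rewritten and re-run.
if not results["success"]:
    results = agent.chat_with_workflow(
        [{"role": "user", "content": "Only count the red cars"}],
        image="cars.jpg",
        plan=results["plan"],
    )
```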
diff --git a/vision_agent/agent/vision_agent_v2_prompt.py b/vision_agent/agent/vision_agent_v2_prompt.py
index 881274c5..4003b4df 100644
--- a/vision_agent/agent/vision_agent_v2_prompt.py
+++ b/vision_agent/agent/vision_agent_v2_prompt.py
@@ -1,3 +1,8 @@
+USER_REQ_CONTEXT = """
+## User Requirement
+{user_requirement}
+"""
+
 USER_REQ_SUBTASK_CONTEXT = """
 ## User Requirement
 {user_requirement}
@@ -6,11 +11,16 @@
 {subtask}
 """
 
-USER_REQ_CONTEXT = """
+USER_REQ_SUBTASK_WM_CONTEXT = """
 ## User Requirement
 {user_requirement}
-"""
+
+## Current Subtask
+{subtask}
+
+## Previous Task
+{working_memory}
+"""
 
 PLAN = """
 # Context
 {context}
@@ -27,11 +37,13 @@
 - For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code.
 - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask.
 - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality.
+- If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful.
 
 Output a list of jsons in the following format:
 
 ```json
 {{
+    "user_req": str, # "a summarized version of the user requirement"
     "plan": [
         {{
@@ -61,8 +73,9 @@
 {code}
 
 # Constraints
-- Write a function that accomplishes the User Requirement. You are supplied code from a previous task, feel free to copy over that code into your own implementation if you need it.
-- Always prioritize using pre-defined tools or code for the same functionality. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it.
+- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import.
+- You may receive previous trials and errors under 'Previous Task'; this is code, output and reflections from previous tasks. You can use these to avoid running into the same issues when writing your code.
 - Write clean, readable, and well-documented code.
 
 # Output
@@ -102,6 +115,7 @@ def add(a: int, b: int) -> int:
 
 
 PREV_CODE_CONTEXT = """
+[previous impl]
 ```python
 {code}
 ```
@@ -112,18 +126,20 @@ def add(a: int, b: int) -> int:
 
 
 PREV_CODE_CONTEXT_WITH_REFLECTION = """
+[reflection on previous impl]
+{reflection}
+
+[new impl]
 ```python
 {code}
 ```
 
-[previous output]
+[new output]
 {result}
 
-[reflection on previous impl]
-{reflection}
 """
 
-
+# don't need [previous impl] because it will come from PREV_CODE_CONTEXT or PREV_CODE_CONTEXT_WITH_REFLECTION
 DEBUG = """
 [example]
 Here is an example of debugging with reflection.
@@ -133,7 +149,6 @@ def add(a: int, b: int) -> int:
 [context]
 {context}
 
-[previous impl]
 {previous_impl}
 
 [instruction]
@@ -158,7 +173,7 @@ def add(a: int, b: int) -> int:
 {code}
 
 # Constraints
-- Write code to test the functionality of the provided code according to the Current Subtask. If you cannot test the code, then write code to visualize the result by calling the code.
+- Write code to test the functionality of the provided code according to the 'Current Subtask'. If you cannot test the code, then write code to visualize the result by calling the code.
 - Always prioritize using pre-defined tools for the same functionality.
 - Write clean, readable, and well-documented code.
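To make the new working-memory context concrete, here is a minimal sketch of how the template added above gets filled. The example values are made up; in the patch, `write_code` fills `working_memory` with retrieved long-term memory and `debug_code` fills it with the retrieved reflections, while the per-attempt code/output history goes into `{previous_impl}` separately.

```python
from vision_agent.agent.vision_agent_v2_prompt import USER_REQ_SUBTASK_WM_CONTEXT

context = USER_REQ_SUBTASK_WM_CONTEXT.format(
    user_requirement="Count the cars in cars.jpg",  # illustrative request
    subtask="Write a function that detects cars and returns the count",
    working_memory="[previous impl]\n...\n[previous output]\nNameError: ...",
)
print(context)
```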
diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py
index 96b6477b..e3563fb6 100644
--- a/vision_agent/llm/llm.py
+++ b/vision_agent/llm/llm.py
@@ -34,7 +34,7 @@ class OpenAILLM(LLM):
 
     def __init__(
         self,
-        model_name: str = "gpt-4-turbo",
+        model_name: str = "gpt-4o",
         api_key: Optional[str] = None,
         json_mode: bool = False,
         system_prompt: Optional[str] = None,
diff --git a/vision_agent/utils/__init__.py b/vision_agent/utils/__init__.py
index 11dc0d42..10a49df4 100644
--- a/vision_agent/utils/__init__.py
+++ b/vision_agent/utils/__init__.py
@@ -1,3 +1,3 @@
 from .execute import Execute
-from .sim import Sim
+from .sim import Sim, load_sim, merge_sim
 from .video import extract_frames_from_video
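A short sketch of what the default-model change means for callers; it assumes an OpenAI API key is configured, since constructing the wrapper creates a client. The previous default remains available by passing `model_name` explicitly.

```python
from vision_agent.llm import OpenAILLM

llm = OpenAILLM()                             # now defaults to "gpt-4o"
legacy = OpenAILLM(model_name="gpt-4-turbo")  # the old default, opt-in
print(llm("Describe the image tools available in one sentence."))
```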
diff --git a/vision_agent/utils/sim.py b/vision_agent/utils/sim.py
index 3a244cd8..4b4298d7 100644
--- a/vision_agent/utils/sim.py
+++ b/vision_agent/utils/sim.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Dict, List, Optional, Sequence, Union
 
+import numpy as np
 import pandas as pd
 from openai import Client
 from scipy.spatial.distance import cosine  # type: ignore
@@ -46,7 +47,14 @@ def __init__(
         )
 
     def save(self, sim_file: Union[str, Path]) -> None:
-        self.df.to_csv(sim_file, index=False)
+        sim_file = Path(sim_file)
+        sim_file.mkdir(parents=True, exist_ok=True)
+
+        df = self.df.copy()
+        embs = np.array(df.embs.tolist())
+        np.save(sim_file / "embs.npy", embs)
+        df = df.drop("embs", axis=1)
+        df.to_csv(sim_file / "df.csv", index=False)
 
     def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
         """Returns the top k most similar items to the query.
@@ -65,6 +73,13 @@ def top_k(self, query: str, k: int = 5) -> Sequence[Dict]:
         return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records")
 
 
+def merge_sim(sim1: Sim, sim2: Sim) -> Sim:
+    return Sim(pd.concat([sim1.df, sim2.df], ignore_index=True))
+
+
 def load_sim(sim_file: Union[str, Path]) -> Sim:
-    df = pd.read_csv(sim_file)
+    sim_file = Path(sim_file)
+    df = pd.read_csv(sim_file / "df.csv")
+    embs = np.load(sim_file / "embs.npy")
+    df["embs"] = list(embs)
     return Sim(df)
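Finally, a sketch of the fixed save/load round trip and the new merge helper. The "save and load" fix above changes persistence from a single CSV to a directory holding df.csv plus embs.npy, so reloading does not re-embed. Paths and rows here are illustrative; building a Sim from raw text still requires an OpenAI API key to embed the `desc` column.

```python
import pandas as pd

from vision_agent.utils import Sim, load_sim, merge_sim

index = Sim(
    pd.DataFrame({"desc": ["detect objects"], "doc": ["grounding_dino(prompt, image)"]}),
    sim_key="desc",
)
index.save("sim_index")            # writes sim_index/df.csv and sim_index/embs.npy

restored = load_sim("sim_index")   # embeddings are reloaded, no re-embedding cost
combined = merge_sim(restored, index)  # concatenates the underlying DataFrames
print(combined.top_k("find all objects", k=1))
```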