From b7b166730471c35ebaece3993ed8bc87a4cb9ab7 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 7 May 2024 15:25:48 -0700 Subject: [PATCH] changed file names, added comments --- vision_agent/agent/__init__.py | 2 +- ...ted_vision_agent.py => vision_agent_v2.py} | 48 ++++++++++++++----- ...nt_prompt.py => vision_agent_v2_prompt.py} | 2 +- vision_agent/utils/execute.py | 3 ++ 4 files changed, 42 insertions(+), 13 deletions(-) rename vision_agent/agent/{automated_vision_agent.py => vision_agent_v2.py} (82%) rename vision_agent/agent/{automated_vision_agent_prompt.py => vision_agent_v2_prompt.py} (98%) diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 6113b6bc..b358d3b0 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,6 +1,6 @@ from .agent import Agent from .agent_coder import AgentCoder -from .automated_vision_agent import AutomatedVisionAgent from .easytool import EasyTool from .reflexion import Reflexion from .vision_agent import VisionAgent +from .vision_agent_v2 import VisionAgentV2 diff --git a/vision_agent/agent/automated_vision_agent.py b/vision_agent/agent/vision_agent_v2.py similarity index 82% rename from vision_agent/agent/automated_vision_agent.py rename to vision_agent/agent/vision_agent_v2.py index c5dc5c02..f73e159e 100644 --- a/vision_agent/agent/automated_vision_agent.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -12,7 +12,7 @@ from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF from vision_agent.utils import Execute, Sim -from .automated_vision_agent_prompt import ( +from .vision_agent_v2_prompt import ( CODE, CODE_SYS_MSG, DEBUG, @@ -164,9 +164,11 @@ def run_plan( code: str, tool_recommender: Sim, verbose: bool = False, -) -> Tuple[str, List[Dict[str, Any]]]: +) -> Tuple[str, str, List[Dict[str, Any]], Dict[str, List[str]]]: active_plan = [e for e in plan if "success" not in e or not e["success"]] working_memory: Dict[str, List[str]] = {} + current_code = 
code + current_test = "" for task in active_plan: _LOGGER.info( f""" @@ -178,13 +180,18 @@ def run_plan( success, code, result, task_memory = write_and_exec_code( user_req, task["instruction"], - code, + current_code, write_code if task["type"] == "code" else write_test, coder, tool_info, exec, verbose, ) + if task["type"] == "code": + current_code = code + else: + current_test = code + working_memory.update(task_memory) if verbose: @@ -200,10 +207,27 @@ def run_plan( if not success: break - return code, plan - + return current_code, current_test, plan, working_memory + + +class VisionAgentV2(Agent): + """Vision Agent is an AI agentic framework geared towards outputting Python code to + solve vision tasks. It is inspired by MetaGPT's Data Interpreter + https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it + generate code: + - A planner to generate a plan of tasks to solve a user requirement. The planner + can output code tasks or test tasks, where test tasks are used to verify the code. + - Automatic debugging: if a task fails, the agent will attempt to debug the code + using the failed output to fix it. + - A tool recommender to recommend tools to use for a given task. LLM performance + on tool retrieval starts to decrease as you add more tools; tool retrieval helps + keep the number of tools to choose from low. + - Memory retrieval: the agent can remember previous iterations on tasks to help it + with new tasks. + - Dynamic replanning: the agent can ask for feedback and replan remaining tasks + based on that feedback. 
+ """ -class AutomatedVisionAgent(Agent): def __init__( self, timeout: int = 600, @@ -225,7 +249,7 @@ def __call__( self, input: Union[List[Dict[str, str]], str], image: Optional[Union[str, Path]] = None, - ) -> str: + ) -> Tuple[str, str]: if isinstance(input, str): input = [{"role": "user", "content": input}] return self.chat(input, image) @@ -234,7 +258,7 @@ def chat( self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None, - ) -> str: + ) -> Tuple[str, str]: if len(chat) == 0: raise ValueError("Input cannot be empty.") @@ -254,7 +278,7 @@ def chat( success = False while not success: - working_code, plan = run_plan( + working_code, working_test, plan, working_memory_i = run_plan( user_req, plan, self.coder, @@ -264,11 +288,13 @@ def chat( self.verbose, ) success = all(task["success"] for task in plan) + working_memory.update(working_memory_i) if not success: - pass + # TODO: ask for feedback and replan + break - return working_code + return working_code, working_test def log_progress(self, description: str) -> None: pass diff --git a/vision_agent/agent/automated_vision_agent_prompt.py b/vision_agent/agent/vision_agent_v2_prompt.py similarity index 98% rename from vision_agent/agent/automated_vision_agent_prompt.py rename to vision_agent/agent/vision_agent_v2_prompt.py index 7d8bffdc..881274c5 100644 --- a/vision_agent/agent/automated_vision_agent_prompt.py +++ b/vision_agent/agent/vision_agent_v2_prompt.py @@ -47,7 +47,7 @@ """ -CODE_SYS_MSG = """You are an AI Python assistant. You need to help user to achieve their goal by implementing a function. Your code will be run in a jupyter notebook environment so don't use asyncio.run. Instead, use await if you need to call an async function. Do not use 'display' for showing images.""" +CODE_SYS_MSG = """You are an AI Python assistant. You need to help user to achieve their goal by implementing a function. Your code will be run in a jupyter notebook environment so don't use asyncio.run. 
Instead, use await if you need to call an async function. Do not use 'display' for showing images, instead use matplotlib or PIL.""" CODE = """ diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index cf162f29..aa882728 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -1,3 +1,6 @@ +"""This code is adapted from MetaGPT's https://github.com/geekan/MetaGPT/blob/main/metagpt/actions/di/execute_nb_code.py +""" + import base64 as b64 import io import re