From 820e9c483d43210719cff6f77834c296b547e2cc Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 14:21:41 -0700 Subject: [PATCH 01/10] added different verbosity levels, better json parsing --- vision_agent/agent/vision_agent_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 1f93be11..f40cf30b 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -63,7 +63,7 @@ def extract_json(json_str: str) -> Dict[str, Any]: # get the last ``` not one from an intermediate string json_str = json_str[: json_str.find("}```")] json_dict = json.loads(json_str) - return json_dict # type: ignore + return json_dict def write_plan( @@ -80,8 +80,8 @@ def write_plan( context = USER_REQ_CONTEXT.format(user_requirement=user_requirements) prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc) chat[-1]["content"] = prompt - new_plan = extract_json(model.chat(chat)) - return new_plan["user_req"], new_plan["plan"] + plan = extract_json(model.chat(chat)) + return plan["user_req"], plan["plan"] # type: ignore def write_code( From 421564c11dec2b49e31bf00a0546091980ed18a2 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 14 May 2024 14:25:26 -0700 Subject: [PATCH 02/10] fix typing error --- vision_agent/agent/vision_agent_v2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index f40cf30b..1f93be11 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -63,7 +63,7 @@ def extract_json(json_str: str) -> Dict[str, Any]: # get the last ``` not one from an intermediate string json_str = json_str[: json_str.find("}```")] json_dict = json.loads(json_str) - return json_dict + return json_dict # type: ignore def write_plan( @@ -80,8 +80,8 @@ def write_plan( context = USER_REQ_CONTEXT.format(user_requirement=user_requirements) prompt = PLAN.format(context=context, plan=str(plan), tool_desc=tool_desc) chat[-1]["content"] = prompt - plan = extract_json(model.chat(chat)) - return plan["user_req"], plan["plan"] # type: ignore + new_plan = extract_json(model.chat(chat)) + return new_plan["user_req"], new_plan["plan"] def write_code( From 7d454d17af0d9e9ce763f77d4845d5a69bbe4222 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 13:27:39 -0700 Subject: [PATCH 03/10] fixed issues with agent coder --- vision_agent/agent/agent_coder.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index a1304eaa..998c0d8c 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -4,6 +4,8 @@ import sys from pathlib import Path from typing import Dict, List, Optional, Union +from rich.console import Console +from rich.syntax import Syntax from vision_agent.agent import Agent from vision_agent.agent.agent_coder_prompts import ( @@ -40,6 +42,7 @@ logging.basicConfig(stream=sys.stdout) _LOGGER = logging.getLogger(__name__) _EXECUTE = Execute() +_CONSOLE = Console() def write_tests(question: str, code: str, model: LLM) -> str: @@ -103,7 +106,7 @@ def run_visual_tests( def fix_bugs(code: str, tests: str, result: str, feedback: str, model: LLM) -> str: - prompt = FIX_BUG.format(completion=code, test_case=tests, result=result) + prompt = FIX_BUG.format(code=code, tests=tests, result=result, feedback=feedback) completion = model(prompt) return preprocess_data(completion) @@ -139,7 +142,8 @@ def __init__( else visual_tester_agent ) self.max_turns = 3 - if verbose: + self.verbose = verbose + if self.verbose: _LOGGER.setLevel(logging.INFO) def __call__( @@ -164,9 +168,11 @@ def chat( feedback = "" for _ in range(self.max_turns): code = write_program(question, feedback, self.coder_agent) - _LOGGER.info(f"code:\n{code}") + if self.verbose: + _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)) debug = write_debug(question, code, feedback, self.tester_agent) - _LOGGER.info(f"debug:\n{debug}") + if self.verbose: + _CONSOLE.print(Syntax(debug, "python", theme="gruvbox-dark", line_numbers=True)) results = execute_tests(code, debug) _LOGGER.info( f"execution results: passed: {results['passed']}\n{results['result']}" @@ -176,7 +182,8 @@ def chat( code = fix_bugs( code, debug, results["result"].strip(), feedback, self.coder_agent # type: ignore ) - _LOGGER.info(f"fixed code:\n{code}") + if self.verbose: + _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)) else: # TODO: Sometimes it prints nothing, so we need to handle that case # TODO: The visual agent reflection does not work very well, needs more testing From 809c90a3c375235855c40783c3093ac1684f005e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 13:27:57 -0700 Subject: [PATCH 04/10] add save json --- vision_agent/tools/tools_v2.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index ab36869f..d0845f1c 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -2,6 +2,7 @@ import io import logging import tempfile +import json from importlib import resources from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, Union, cast @@ -285,6 +286,31 @@ def closest_box_distance(box1: List[float], box2: List[float]) -> float: # Utility and visualization functions +def save_json(data: Any, file_path: str) -> None: + """'save_json' is a utility function that saves data as a JSON file. It is helpful + for saving data that contains NumPy arrays which are not JSON serializable. + + Parameters: + data (Any): The data to save. + file_path (str): The path to save the JSON file. + + Example + ------- + >>> save_json(data, "path/to/file.json") + """ + + class NumpyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.bool_): + return bool(obj) + return json.JSONEncoder.default(self, obj) + + with open(file_path, "w") as f: + json.dump(data, f, cls=NumpyEncoder) + + def load_image(image_path: str) -> np.ndarray: """'load_image' is a utility function that loads an image from the given path. @@ -480,6 +506,7 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame: ocr, closest_mask_distance, closest_box_distance, + save_json, load_image, save_image, overlay_bounding_boxes, @@ -489,5 +516,5 @@ def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame: TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore UTILITIES_DOCSTRING = get_tool_documentation( - [load_image, save_image, overlay_bounding_boxes] + [save_json, load_image, save_image, overlay_bounding_boxes] ) From 3e1bbbb7a937ba1dc14f585295cfdb4af5b4b0bb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 13:28:07 -0700 Subject: [PATCH 05/10] add thresh to top k --- vision_agent/utils/sim.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vision_agent/utils/sim.py b/vision_agent/utils/sim.py index 4b4298d7..ad50149a 100644 --- a/vision_agent/utils/sim.py +++ b/vision_agent/utils/sim.py @@ -56,7 +56,7 @@ def save(self, sim_file: Union[str, Path]) -> None: df = df.drop("embs", axis=1) df.to_csv(sim_file / "df.csv", index=False) - def top_k(self, query: str, k: int = 5) -> Sequence[Dict]: + def top_k(self, query: str, k: int = 5, thresh: Optional[float] = None) -> Sequence[Dict]: """Returns the top k most similar items to the query. Parameters: @@ -70,6 +70,8 @@ def top_k(self, query: str, k: int = 5) -> Sequence[Dict]: embedding = get_embedding(self.client, query, model=self.model) self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding)) res = self.df.sort_values("sim", ascending=False).head(k) + if thresh is not None: + res = res[res.sim > thresh] return res[[c for c in res.columns if c != "embs"]].to_dict(orient="records") From 1fbf5825d4b75f97197f8b90784e485149c9886b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 14:14:33 -0700 Subject: [PATCH 06/10] fix bug, add thresh for top k tools --- vision_agent/agent/vision_agent_v2.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 1f93be11..9bb39c1b 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -18,6 +18,7 @@ PLAN, PREV_CODE_CONTEXT, PREV_CODE_CONTEXT_WITH_REFLECTION, + REPLAN, TEST, USER_REQ_CONTEXT, USER_REQ_SUBTASK_CONTEXT, @@ -235,7 +236,7 @@ def run_plan( f""" {tabulate(tabular_data=[task], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" ) - tools = tool_recommender.top_k(task["instruction"]) + tools = tool_recommender.top_k(task["instruction"], thresh=0.3) tool_info = "\n".join([e["doc"] for e in tools]) if verbosity == 2: @@ -285,6 +286,7 @@ class VisionAgentV2(Agent): solve vision tasks. It is inspired by MetaGPT's Data Interpreter https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it generate code: + - A planner to generate a plan of tasks to solve a user requirement. The planner can output code tasks or test tasks, where test tasks are used to verify the code. - Automatic debugging, if a task fails, the agent will attempt to debug the code @@ -377,7 +379,7 @@ def chat_with_workflow( self.long_term_memory, self.verbosity, ) - success = all(task["success"] for task in plan) + success = all(task["success"] if "success" in task else False for task in plan) working_memory.update(working_memory_i) if not success: From c4b4b4a1025474d8ee0a036995dfad3e2fd1b4d1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 14:14:45 -0700 Subject: [PATCH 07/10] update prompts --- vision_agent/agent/vision_agent_v2_prompt.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2_prompt.py b/vision_agent/agent/vision_agent_v2_prompt.py index 4003b4df..87895da0 100644 --- a/vision_agent/agent/vision_agent_v2_prompt.py +++ b/vision_agent/agent/vision_agent_v2_prompt.py @@ -34,7 +34,7 @@ # Task: Based on the context and the tools you have available, write a plan of subtasks to achieve the user request that adhere to the following requirements: -- For each subtask, you should provide a short instruction on what to do. Ensure the subtasks are large enough to be meaningful, encompassing multiple lines of code. +- For each subtask, you should provide instructions on what to do. Write detailed subtasks, ensure they are large enough to be meaningful, encompassing multiple lines of code. - You do not need to have the agent rewrite any tool functionality you already have, you should instead instruct it to utilize one or more of those tools in each subtask. - You can have agents either write coding tasks, to code some functionality or testing tasks to test previous functionality. - If a current plan exists, examine each item in the plan to determine if it was successful. If there was an item that failed, i.e. 'success': False, then you should rewrite that item and all subsequent items to ensure that the rewritten plan is successful. @@ -73,9 +73,10 @@ {code} # Constraints -- Write a function that accomplishes the 'User Requirement'. You are supplied code from a previous task under 'Previous Code', feel free to copy over that code into your own implementation if you need it. -- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info for Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import. +- Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'. +- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import. - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code. +- Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file. - Write clean, readable, and well-documented code. # Output From c84812cfeebf6ee855fa2e1eef6308ce85015d05 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 14:19:05 -0700 Subject: [PATCH 08/10] black and isort --- vision_agent/agent/agent_coder.py | 13 ++++++++++--- vision_agent/agent/vision_agent_v2.py | 4 +++- vision_agent/tools/tools_v2.py | 2 +- vision_agent/utils/sim.py | 4 +++- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index 998c0d8c..fca9ea64 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -4,6 +4,7 @@ import sys from pathlib import Path from typing import Dict, List, Optional, Union + from rich.console import Console from rich.syntax import Syntax @@ -169,10 +170,14 @@ def chat( for _ in range(self.max_turns): code = write_program(question, feedback, self.coder_agent) if self.verbose: - _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)) + _CONSOLE.print( + Syntax(code, "python", theme="gruvbox-dark", line_numbers=True) + ) debug = write_debug(question, code, feedback, self.tester_agent) if self.verbose: - _CONSOLE.print(Syntax(debug, "python", theme="gruvbox-dark", line_numbers=True)) + _CONSOLE.print( + Syntax(debug, "python", theme="gruvbox-dark", line_numbers=True) + ) results = execute_tests(code, debug) _LOGGER.info( f"execution results: passed: {results['passed']}\n{results['result']}" @@ -183,7 +188,9 @@ def chat( code, debug, results["result"].strip(), feedback, self.coder_agent # type: ignore ) if self.verbose: - _CONSOLE.print(Syntax(code, "python", theme="gruvbox-dark", line_numbers=True)) + _CONSOLE.print( + Syntax(code, "python", theme="gruvbox-dark", line_numbers=True) + ) else: # TODO: Sometimes it prints nothing, so we need to handle that case # TODO: The visual agent reflection does not work very well, needs more testing diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 9bb39c1b..6053994f 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -379,7 +379,9 @@ def chat_with_workflow( self.long_term_memory, self.verbosity, ) - success = all(task["success"] if "success" in task else False for task in plan) + success = all( + task["success"] if "success" in task else False for task in plan + ) working_memory.update(working_memory_i) if not success: diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index d0845f1c..2cd33df2 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -1,8 +1,8 @@ import inspect import io +import json import logging import tempfile -import json from importlib import resources from pathlib import Path from typing import Any, Callable, Dict, List, Tuple, Union, cast diff --git a/vision_agent/utils/sim.py b/vision_agent/utils/sim.py index ad50149a..614fdefa 100644 --- a/vision_agent/utils/sim.py +++ b/vision_agent/utils/sim.py @@ -56,7 +56,9 @@ def save(self, sim_file: Union[str, Path]) -> None: df = df.drop("embs", axis=1) df.to_csv(sim_file / "df.csv", index=False) - def top_k(self, query: str, k: int = 5, thresh: Optional[float] = None) -> Sequence[Dict]: + def top_k( + self, query: str, k: int = 5, thresh: Optional[float] = None + ) -> Sequence[Dict]: """Returns the top k most similar items to the query. Parameters: From e7095b9256b2a3ee56b597483575d102462201a3 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 14:20:40 -0700 Subject: [PATCH 09/10] fix type errors --- vision_agent/agent/vision_agent_v2.py | 1 - vision_agent/tools/tools_v2.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/vision_agent_v2.py index 6053994f..5f79e29a 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/vision_agent_v2.py @@ -18,7 +18,6 @@ PLAN, PREV_CODE_CONTEXT, PREV_CODE_CONTEXT_WITH_REFLECTION, - REPLAN, TEST, USER_REQ_CONTEXT, USER_REQ_SUBTASK_CONTEXT, diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py index 2cd33df2..1f1a3c6d 100644 --- a/vision_agent/tools/tools_v2.py +++ b/vision_agent/tools/tools_v2.py @@ -300,7 +300,7 @@ def save_json(data: Any, file_path: str) -> None: """ class NumpyEncoder(json.JSONEncoder): - def default(self, obj): + def default(self, obj: Any): # type: ignore if isinstance(obj, np.ndarray): return obj.tolist() elif isinstance(obj, np.bool_): From f6a08fc18d6c2ca0b956e315913c0a6b1bd5102e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 15 May 2024 16:01:07 -0700 Subject: [PATCH 10/10] added thresh doc --- vision_agent/utils/sim.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/utils/sim.py b/vision_agent/utils/sim.py index 614fdefa..cdfab0d0 100644 --- a/vision_agent/utils/sim.py +++ b/vision_agent/utils/sim.py @@ -64,6 +64,7 @@ def top_k( Parameters: query: str: The query to compare to. k: int: The number of items to return. + thresh: Optional[float]: The minimum similarity threshold. Returns: Sequence[Dict]: The top k most similar items.