From d244cf3f14e28bb073b3d7c4e3531f239ad8e513 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Thu, 19 Sep 2024 09:02:30 -0700
Subject: [PATCH] pass plan thoughts to coder

pick_plan now returns the whole JSON object produced while picking the
best plan instead of only its "best_plan" value. chat_with_workflow reads
"best_plan" and "thoughts" from that object and forwards the thoughts
through write_and_test_code and write_code, where they are passed to the
prompt template as the new {plan_thoughts} field.

---
Review notes (text in this section is ignored by `git am`):

* write_code declares plan_thoughts AFTER tool_output. write_and_test_code
  calls write_code positionally in the order
  (plan, tool_info, tool_output, plan_thoughts, feedback); declaring the
  parameters in the opposite order would swap the tool output and the
  thoughts inside the prompt.
* chat_with_workflow reads the thoughts with .get("thoughts", "") because
  the fallback dict built in pick_plan only contains "best_plan", and the
  model output is not guaranteed to contain a "thoughts" key.
* The extracted string is bound to plan_thoughts_str so the name
  plan_thoughts is not rebound from a dict to a str.
* Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch. The "index" blob ids below belong to the original
  commit; regenerate the patch with `git format-patch` after applying if
  exact ids are required.

 vision_agent/agent/vision_agent_coder.py      | 32 ++++++++++++-------
 .../agent/vision_agent_coder_prompts.py       | 13 +++++---
 2 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index 5dc52bb6..46d9c3bf 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -123,7 +123,7 @@ def pick_plan(
     log_progress: Callable[[Dict[str, Any]], None],
     verbosity: int = 0,
     max_retries: int = 3,
-) -> Tuple[str, str]:
+) -> Tuple[Dict[str, str], str]:
     log_progress(
         {
             "type": "log",
@@ -233,10 +233,10 @@ def pick_plan(
     chat[-1]["content"] = prompt
 
     count = 0
-    best_plan = None
-    while best_plan is None and count < max_retries:
+    plan_thoughts = None
+    while plan_thoughts is None and count < max_retries:
         try:
-            best_plan = extract_json(model(chat, stream=False))  # type: ignore
+            plan_thoughts = extract_json(model(chat, stream=False))  # type: ignore
         except JSONDecodeError as e:
             _LOGGER.exception(
                 f"Error while extracting JSON during picking best plan {str(e)}"
@@ -245,23 +245,23 @@ def pick_plan(
         count += 1
 
     if (
-        best_plan is None
-        or "best_plan" not in best_plan
-        or ("best_plan" in best_plan and best_plan["best_plan"] not in plans)
+        plan_thoughts is None
+        or "best_plan" not in plan_thoughts
+        or ("best_plan" in plan_thoughts and plan_thoughts["best_plan"] not in plans)
     ):
-        best_plan = {"best_plan": list(plans.keys())[0]}
+        plan_thoughts = {"best_plan": list(plans.keys())[0]}
 
     if verbosity >= 1:
-        _LOGGER.info(f"Best plan:\n{best_plan}")
+        _LOGGER.info(f"Best plan:\n{plan_thoughts}")
     log_progress(
         {
             "type": "log",
             "log_content": "Picked best plan",
             "status": "completed",
-            "payload": plans[best_plan["best_plan"]],
+            "payload": plans[plan_thoughts["best_plan"]],
         }
     )
-    return best_plan["best_plan"], tool_output_str
+    return plan_thoughts, tool_output_str
 
 
 def write_code(
@@ -269,6 +269,7 @@ def write_code(
     chat: List[Message],
     plan: str,
     tool_info: str,
     tool_output: str,
+    plan_thoughts: str,
     feedback: str,
 ) -> str:
@@ -281,6 +282,7 @@ def write_code(
         docstring=tool_info,
         question=FULL_TASK.format(user_request=user_request, subtasks=plan),
         tool_output=tool_output,
+        plan_thoughts=plan_thoughts,
         feedback=feedback,
     )
     chat[-1]["content"] = prompt
@@ -316,6 +318,7 @@ def write_and_test_code(
     plan: str,
     tool_info: str,
     tool_output: str,
+    plan_thoughts: str,
     tool_utils: str,
     working_memory: List[Dict[str, str]],
     coder: LMM,
@@ -340,6 +343,7 @@ def write_and_test_code(
         plan,
         tool_info,
         tool_output,
+        plan_thoughts,
         format_memory(working_memory),
     )
     test = write_test(
@@ -760,7 +764,7 @@ def chat_with_workflow(
             )
 
             if test_multi_plan:
-                best_plan, tool_output_str = pick_plan(
+                plan_thoughts, tool_output_str = pick_plan(
                     int_chat,
                     plans,
                     tool_infos["all"],
@@ -770,9 +774,12 @@ def chat_with_workflow(
                     self.log_progress,
                     verbosity=self.verbosity,
                 )
+                best_plan = plan_thoughts["best_plan"]
+                plan_thoughts_str = plan_thoughts.get("thoughts", "")
             else:
                 best_plan = list(plans.keys())[0]
                 tool_output_str = ""
+                plan_thoughts_str = ""
 
             if best_plan in plans and best_plan in tool_infos:
                 plan_i = plans[best_plan]
@@ -807,6 +814,7 @@ def chat_with_workflow(
                 + "\n-".join([e for e in plan_i["instructions"]]),
                 tool_info=tool_info,
                 tool_output=tool_output_str,
+                plan_thoughts=plan_thoughts_str,
                 tool_utils=T.UTILITIES_DOCSTRING,
                 working_memory=working_memory,
                 coder=self.coder,
diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py
index 040ee0dc..e961b896 100644
--- a/vision_agent/agent/vision_agent_coder_prompts.py
+++ b/vision_agent/agent/vision_agent_coder_prompts.py
@@ -114,13 +114,14 @@
 
 
 ```python
-import numpy as np
 from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames("video.mp4", 1)
 frames = [f[0] for f in frames][:10]
 
+# import numpy for remove_array auxiliary function
+import numpy as np
 def remove_arrays(o):
     if isinstance(o, list):
         return [remove_arrays(e) for e in o]
@@ -179,7 +180,7 @@ def remove_arrays(o):
 3. Output a JSON object with the following format:
 {{
     "predicted_answer": str # the answer you would expect from the best plan
-    "thoughts": str # your thought process for choosing the best plan
+    "thoughts": str # your thought process for choosing the best plan, any adjustments you would make to the plan
     "best_plan": str # the best plan you have chosen
 }}
 """
@@ -202,15 +203,19 @@ def remove_arrays(o):
 **User Instructions**:
 {question}
 
-**Tool Output**:
+**Tool Outputs**:
 {tool_output}
 
+
+**Tool Output Thoughts**:
+{plan_thoughts}
+
 **Previous Feedback**:
 {feedback}
 
 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task.
-2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool output to guide your decision.
+2. **Algorithm/Method Selection**: Decide on the most efficient method, use the tool outputs to guide your decision.
 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
 4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
 """