From cffd0606bb416d12fb46a1a63f09c196fc75d8a1 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 3 Oct 2024 16:46:39 -0700 Subject: [PATCH] revert changes with planning step for now --- vision_agent/agent/vision_agent.py | 26 ++++++++++++++++++---- vision_agent/tools/meta_tools.py | 35 +++++------------------------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 518858d1..ac9f4f32 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -204,7 +204,7 @@ def __call__( input: Union[str, List[Message]], media: Optional[Union[str, Path]] = None, artifacts: Optional[Artifacts] = None, - ) -> List[Message]: + ) -> str: """Chat with VisionAgent and get the conversation response. Parameters: @@ -221,10 +221,28 @@ def __call__( input = [{"role": "user", "content": input}] if media is not None: input[0]["media"] = [media] - results, _ = self.chat_with_code(input, artifacts) - return results + results, _ = self.chat_and_artifacts(input, artifacts) + return results[-1]["content"] # type: ignore + + def chat( + self, + chat: List[Message], + ) -> List[Message]: + """Chat with VisionAgent, it will use code to execute actions to accomplish + its tasks. + + Parameters: + chat (List[Message]): A conversation in the format of: + [{"role": "user", "content": "describe your task here..."}] + or if it contains media files, it should be in the format of: + [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}] + + Returns: + List[Message]: The conversation response. + """ + return self.chat_and_artifacts(chat)[0] - def chat_with_code( + def chat_and_artifacts( self, chat: List[Message], artifacts: Optional[Artifacts] = None, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 6e97198e..cdd522a8 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -405,7 +405,7 @@ def generate_vision_plan( output_str += f"\nbest plan: {response.best_plan}\n" output_str += "thoughts: " + response.plan_thoughts.strip() + "\n" - output_str += f"[End Plan Context]" + output_str += "[End Plan Context]" print(output_str) return output_str @@ -415,9 +415,6 @@ def generate_vision_code( name: str, chat: str, media: List[str], - plan: Optional[Dict[str, Union[str, List[str]]]] = None, - plan_thoughts: Optional[str] = None, - plan_context_artifact: Optional[str] = None, test_multi_plan: bool = True, custom_tool_names: Optional[List[str]] = None, ) -> str: @@ -428,10 +425,6 @@ def generate_vision_code( name (str): The name of the artifact to save the code to. chat (str): The chat message from the user. media (List[str]): The media files to use. - plan (Optional[Dict[str, Union[str, List[str]]]): The plan to use to generate - the code. - plan_thoughts (Optional[str]): The thoughts to use to generate the code. - plan_context_artifact (Optional[str]): The artifact name of the stored plan context. test_multi_plan (bool): Do not change this parameter. custom_tool_names (Optional[List[str]]): Do not change this parameter. @@ -447,7 +440,6 @@ def detect_dogs(image_path: str): dogs = owl_v2("dog", image) return dogs """ - if ZMQ_PORT is not None: agent = va.agent.VisionAgentCoder( report_progress_callback=lambda inp: report_progress_callback( @@ -458,26 +450,11 @@ def detect_dogs(image_path: str): agent = va.agent.VisionAgentCoder() fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] - if plan is None or plan_thoughts is None or plan_context_artifact is None: - response = agent.generate_code( - fixed_chat, - test_multi_plan=test_multi_plan, - custom_tool_names=custom_tool_names, - ) - else: - plan_context = json.loads(artifacts[plan_context_artifact]) - plan_context = va.agent.PlanContext( - plans={"plan1": plan}, - best_plan="plan1", - plan_thoughts=plan_thoughts, - tool_output=plan_context["tool_output"], - tool_doc=plan_context["tool_doc"], - test_results=None, - ) - response = agent.generate_code_from_plan( - fixed_chat, - plan_context, - ) + response = agent.generate_code( + fixed_chat, + test_multi_plan=test_multi_plan, + custom_tool_names=custom_tool_names, + ) redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code