From c7f9bef03650e22cc51f301364355588a36edd23 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 15:26:53 -0700 Subject: [PATCH 1/8] provide option to run with OpenAILMM --- vision_agent/agent/agent_coder.py | 10 +++++++--- vision_agent/agent/vision_agent_v3.py | 11 ++++++++--- vision_agent/agent/vision_agent_v3_prompts.py | 14 ++++++++------ 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index aad3df66..e2e98da4 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -67,11 +67,15 @@ def parse_file_name(s: str) -> str: return "".join([p for p in s.split(" ") if p.endswith(".png")]) -def write_program(question: str, feedback: str, model: LLM) -> str: +def write_program(question: str, feedback: str, model: LLM, image=None) -> str: prompt = PROGRAM.format( docstring=TOOL_DOCSTRING, question=question, feedback=feedback ) - completion = model(prompt) + if isinstance(model, OpenAILMM): + completion = model(prompt, images=[image] if image else None) + else: + completion = model(prompt) + return preprocess_data(completion) @@ -168,7 +172,7 @@ def chat( code = "" feedback = "" for _ in range(self.max_turns): - code = write_program(question, feedback, self.coder_agent) + code = write_program(question, feedback, self.coder_agent, image=image) if self.verbose: _CONSOLE.print( Syntax(code, "python", theme="gruvbox-dark", line_numbers=True) diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py index 6d194d9c..5b52edcd 100644 --- a/vision_agent/agent/vision_agent_v3.py +++ b/vision_agent/agent/vision_agent_v3.py @@ -20,6 +20,7 @@ USER_REQ, ) from vision_agent.llm import LLM, OpenAILLM +from vision_agent.lmm import LMM, OpenAILMM from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING from vision_agent.utils import Execute from vision_agent.utils.sim import Sim @@ -76,7 
+77,8 @@ def write_plan( chat: List[Dict[str, str]], tool_desc: str, working_memory: str, - model: LLM, + model: Union[LLM, LMM], + images: Optional[List[Union[str, Path]]] = None, ) -> List[Dict[str, str]]: chat = copy.deepcopy(chat) if chat[-1]["role"] != "user": @@ -86,7 +88,10 @@ def write_plan( context = USER_REQ.format(user_request=user_request) prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory) chat[-1]["content"] = prompt - return extract_json(model.chat(chat))["plan"] # type: ignore + if isinstance(model, OpenAILMM): + return extract_json(model.chat(chat, images=images))["plan"] # type: ignore + else: + return extract_json(model.chat(chat))["plan"] # type: ignore def reflect( @@ -309,7 +314,7 @@ def chat_with_workflow( while not success and retries < self.max_retries: plan_i = write_plan( - chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner + chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner, images=[image] ) plan_i_str = "\n-".join([e["instructions"] for e in plan_i]) if self.verbosity >= 1: diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py index 769559a4..8937f1c4 100644 --- a/vision_agent/agent/vision_agent_v3_prompts.py +++ b/vision_agent/agent/vision_agent_v3_prompts.py @@ -61,7 +61,8 @@ 2. **Algorithm/Method Selection**: Decide on the most efficient way. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. -5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools.tools_v2 import *`. Use a debug flag in the function parameters to toggle logging on and off. +5. **Initialization**: Initialize the variables with values provided in the question - {question}. +6.. 
**Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools.tools_v2 import *`. Use a debug flag in the function parameters to toggle logging on and off. """ TEST = """ @@ -161,11 +162,12 @@ def find_text(image_path: str, text: str) -> str: 1. Verify the fundamental functionality under normal conditions. 2. Ensure each test case is well-documented with comments explaining the scenario it covers. 3. Your test case MUST run only on the given image which is {media} -4. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. -5. DO NOT mock any functions, you must test their functionality as is. -6. DO NOT assert the output value, run the code and verify it runs without any errors and assert only the output format or data structure. -7. DO NOT import the testing function as it will available in the testing environment. -8. Print the output of the function that is being tested. +4. Your test case MUST run only with the given values which is available in the question - {question} +5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. +6. DO NOT mock any functions, you must test their functionality as is. +7. DO NOT assert the output value, run the code and verify it runs without any errors and assert only the output format or data structure. +8. DO NOT import the testing function as it will available in the testing environment. +9. Print the output of the function that is being tested. 
""" From 46636ed640d65e622650dbd198cd411a0e9cdbfe Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 16:33:58 -0700 Subject: [PATCH 2/8] removed the additions in code prompt as dillon fix is working --- vision_agent/agent/vision_agent_v3_prompts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py index 8937f1c4..4520722a 100644 --- a/vision_agent/agent/vision_agent_v3_prompts.py +++ b/vision_agent/agent/vision_agent_v3_prompts.py @@ -61,8 +61,7 @@ 2. **Algorithm/Method Selection**: Decide on the most efficient way. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. 4. **Code Generation**: Translate your pseudocode into executable Python code. -5. **Initialization**: Initialize the variables with values provided in the question - {question}. -6.. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools.tools_v2 import *`. Use a debug flag in the function parameters to toggle logging on and off. +5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools.tools_v2 import *`. Use a debug flag in the function parameters to toggle logging on and off. """ TEST = """ From b7705842e66cbabb4d6026d07914ed3d5d8be574 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 17:38:13 -0700 Subject: [PATCH 3/8] edited the test prompt --- vision_agent/agent/vision_agent_v3_prompts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py index ffa83eeb..7652884b 100644 --- a/vision_agent/agent/vision_agent_v3_prompts.py +++ b/vision_agent/agent/vision_agent_v3_prompts.py @@ -172,7 +172,8 @@ def find_text(image_path: str, text: str) -> str: 4. 
Your test case MUST run only with the given values which is available in the question - {question} 5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. 6. DO NOT mock any functions, you must test their functionality as is. -7. DO NOT assert the output value, run the code and verify it runs without any errors and assert only the output format or data structure. +7. DO NOT assert the output value, run the code and assert only the output format or data structure. +9. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect. 8. DO NOT import the testing function as it will available in the testing environment. 9. Print the output of the function that is being tested. """ From ad1bc52f309f54916cbb7b3573d42db332a20951 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 17:44:28 -0700 Subject: [PATCH 4/8] fix mypy errors --- vision_agent/agent/agent_coder.py | 2 +- vision_agent/agent/vision_agent_v3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index 4b070b34..8fdeae8c 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -67,7 +67,7 @@ def parse_file_name(s: str) -> str: return "".join([p for p in s.split(" ") if p.endswith(".png")]) -def write_program(question: str, feedback: str, model: LLM, image=None) -> str: +def write_program(question: str, feedback: str, model: LLM, image: Optional[Union[str, Path]]=None) -> str: prompt = PROGRAM.format( docstring=TOOL_DOCSTRING, question=question, feedback=feedback ) diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py index 70b65b96..bdd66d72 100644 --- a/vision_agent/agent/vision_agent_v3.py +++ b/vision_agent/agent/vision_agent_v3.py @@ -315,7 +315,7 @@ def chat_with_workflow( while not success and retries < self.max_retries: plan_i = 
write_plan( - chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner, images=[image] + chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner, images=[image] if image else None ) plan_i_str = "\n-".join([e["instructions"] for e in plan_i]) if self.verbosity >= 1: From 55c463141ae44508fa9b4e68d722aecfc03212e3 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 21:43:54 -0700 Subject: [PATCH 5/8] fix linting --- vision_agent/agent/agent_coder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index 8fdeae8c..02aea97a 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -67,7 +67,7 @@ def parse_file_name(s: str) -> str: return "".join([p for p in s.split(" ") if p.endswith(".png")]) -def write_program(question: str, feedback: str, model: LLM, image: Optional[Union[str, Path]]=None) -> str: +def write_program(question: str, feedback: str, model: LLM, image: Optional[Union[str, Path]] = None) -> str: prompt = PROGRAM.format( docstring=TOOL_DOCSTRING, question=question, feedback=feedback ) From 7b7a33420943b370cc4d08de7a934effa4c410cc Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 21:48:36 -0700 Subject: [PATCH 6/8] fix black errors --- vision_agent/agent/vision_agent_v3_prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py index 7652884b..93ed5c47 100644 --- a/vision_agent/agent/vision_agent_v3_prompts.py +++ b/vision_agent/agent/vision_agent_v3_prompts.py @@ -173,9 +173,9 @@ def find_text(image_path: str, text: str) -> str: 5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. 6. DO NOT mock any functions, you must test their functionality as is. 7. 
DO NOT assert the output value, run the code and assert only the output format or data structure. -9. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect. -8. DO NOT import the testing function as it will available in the testing environment. -9. Print the output of the function that is being tested. +8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect. +9. DO NOT import the testing function as it will available in the testing environment. +10. Print the output of the function that is being tested. """ From 561c9c912bf4f674ba8a8c8c34982363056b0398 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 21:51:31 -0700 Subject: [PATCH 7/8] fix black issues --- vision_agent/agent/vision_agent_v3.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py index bdd66d72..309e8f24 100644 --- a/vision_agent/agent/vision_agent_v3.py +++ b/vision_agent/agent/vision_agent_v3.py @@ -315,7 +315,11 @@ def chat_with_workflow( while not success and retries < self.max_retries: plan_i = write_plan( - chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner, images=[image] if image else None + chat, + TOOL_DESCRIPTIONS, + format_memory(working_memory), + self.planner, + images=[image] if image else None, ) plan_i_str = "\n-".join([e["instructions"] for e in plan_i]) if self.verbosity >= 1: From acef8138280632ddd249c60abe1d2461c83410a2 Mon Sep 17 00:00:00 2001 From: shankar-landing-ai Date: Tue, 28 May 2024 21:53:54 -0700 Subject: [PATCH 8/8] fix black errors --- vision_agent/agent/agent_coder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index 02aea97a..bb1dd761 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -67,7 +67,9 @@ def 
parse_file_name(s: str) -> str: return "".join([p for p in s.split(" ") if p.endswith(".png")]) -def write_program(question: str, feedback: str, model: LLM, image: Optional[Union[str, Path]] = None) -> str: +def write_program( + question: str, feedback: str, model: LLM, image: Optional[Union[str, Path]] = None +) -> str: prompt = PROGRAM.format( docstring=TOOL_DOCSTRING, question=question, feedback=feedback )