From b5f4fac39fee2ecb7ffd192fd7b52da18b081108 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 14 Oct 2024 10:19:56 -0700 Subject: [PATCH 1/4] fix side case for replace args --- tests/unit/test_meta_tools.py | 4 ++-- vision_agent/tools/meta_tools.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_meta_tools.py b/tests/unit/test_meta_tools.py index 6cac95ce..fff867d9 100644 --- a/tests/unit/test_meta_tools.py +++ b/tests/unit/test_meta_tools.py @@ -91,7 +91,7 @@ def test_use_extra_vision_agent_args_real_case(): assert out_code == expected_code code = "edit_vision_code(artifacts, 'code.py', ['write code 1', 'write code 2'], ['/home/user/n0xn5X6_IMG_2861%20(1).mov'])" - expected_code = "edit_vision_code(artifacts, 'code.py', ['write code 1', 'write code 2'], ['/home/user/n0xn5X6_IMG_2861%20(1).mov'], test_multi_plan=True)" + expected_code = "edit_vision_code(artifacts, 'code.py', ['write code 1', 'write code 2'], ['/home/user/n0xn5X6_IMG_2861%20(1).mov'])" out_code = use_extra_vision_agent_args(code) assert out_code == expected_code @@ -103,6 +103,6 @@ def test_use_extra_vision_args_with_custom_tools(): assert out_code == expected_code code = "edit_vision_code(artifacts, 'code.py', 'write code', ['/home/user/n0xn5X6_IMG_2861%20(1).mov'])" - expected_code = "edit_vision_code(artifacts, 'code.py', 'write code', ['/home/user/n0xn5X6_IMG_2861%20(1).mov'], test_multi_plan=True, custom_tool_names=['tool1', 'tool2'])" + expected_code = "edit_vision_code(artifacts, 'code.py', 'write code', ['/home/user/n0xn5X6_IMG_2861%20(1).mov'], custom_tool_names=['tool1', 'tool2'])" out_code = use_extra_vision_agent_args(code, custom_tool_names=["tool1", "tool2"]) assert out_code == expected_code diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index d9537e7c..b481f4f7 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -676,12 +676,13 @@ def use_extra_vision_agent_args( for 
node in red: # seems to always be atomtrailers not call type if node.type == "atomtrailers": + if node.name.value == "generate_vision_code": + node.value[1].value.append(f"test_multi_plan={test_multi_plan}") + if ( node.name.value == "generate_vision_code" or node.name.value == "edit_vision_code" ): - node.value[1].value.append(f"test_multi_plan={test_multi_plan}") - if custom_tool_names is not None: node.value[1].value.append(f"custom_tool_names={custom_tool_names}") cleaned_code = red.dumps().strip() From 2e858fb22249136a6defccb887ebe94ff465e05a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 14 Oct 2024 10:40:39 -0700 Subject: [PATCH 2/4] fix new names --- examples/chat/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/chat/app.py b/examples/chat/app.py index b07e308e..2069218b 100644 --- a/examples/chat/app.py +++ b/examples/chat/app.py @@ -54,7 +54,7 @@ def update_messages(messages, lock): with lock: if Path("artifacts.pkl").exists(): artifacts.load("artifacts.pkl") - new_chat, _ = agent.chat_with_code(messages, artifacts=artifacts) + new_chat, _ = agent.chat_with_artifacts(messages, artifacts=artifacts) for new_message in new_chat[len(messages) :]: messages.append(new_message) From 3e83cab5e60e1b8dc0f44138dd7bf63d85142d40 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 14 Oct 2024 10:41:31 -0700 Subject: [PATCH 3/4] fix hallucination cases --- tests/unit/test_va.py | 24 ++++++++++++++++++++++-- vision_agent/agent/vision_agent.py | 13 +++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py index e7a6e7c5..b0b80e2d 100644 --- a/tests/unit/test_va.py +++ b/tests/unit/test_va.py @@ -1,4 +1,5 @@ from vision_agent.agent.agent_utils import extract_tag +from vision_agent.agent.vision_agent import _clean_response from vision_agent.tools.meta_tools import use_extra_vision_agent_args @@ -31,7 +32,7 @@ def test_parse_execution_no_test_multi_plan_edit(): code = 
"edit_vision_code(artifacts, 'code.py', ['Generate code'], ['image.png'])" assert ( parse_execution(code, False) - == "edit_vision_code(artifacts, 'code.py', ['Generate code'], ['image.png'], test_multi_plan=False)" + == "edit_vision_code(artifacts, 'code.py', ['Generate code'], ['image.png'])" ) @@ -47,10 +48,29 @@ def test_parse_execution_custom_tool_names_edit(): code = "edit_vision_code(artifacts, 'code.py', ['Generate code'], ['image.png'])" assert ( parse_execution(code, test_multi_plan=False, custom_tool_names=["owl_v2_image"]) - == "edit_vision_code(artifacts, 'code.py', ['Generate code'], ['image.png'], test_multi_plan=False, custom_tool_names=['owl_v2_image'])" + == "edit_vision_code(artifacts, 'code.py', ['Generate code'], ['image.png'], custom_tool_names=['owl_v2_image'])" ) def test_parse_execution_multiple_executes(): code = "print('Hello, World!')print('Hello, World!')" assert parse_execution(code) == "print('Hello, World!')\nprint('Hello, World!')" + + +def test_clean_response(): + response = """Thinking... +Here is the code: +print('Hello, World!')""" + assert _clean_response(response) == response + + +def test_clean_response_remove_extra(): + response = """Thinking... +Here is the code: +print('Hello, World!') +More thinking... +Response to code...""" + expected_response = """Thinking... +Here is the code: +print('Hello, World!')""" + assert _clean_response(response) == expected_response diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 29643ecd..bfd2697b 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -85,6 +85,15 @@ def format_agent_message(agent_message: str) -> str: return output +def _clean_response(response: str) -> str: + # Sometimes the LLM will hallucinate responses to an tag as if it + # had already executed the code. This function removes the hallucinated response. 
+ if "" in response: + end_execute_python = response.find("") + response = response[: end_execute_python + len("")] + return response + + def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: chat = copy.deepcopy(chat) @@ -114,6 +123,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: message["media"] = chat[-1]["media"] conv_resp = cast(str, orch([message], stream=False)) + # clean the response first, if we are executing code, do not respond or end + # conversation before the code has been executed. + conv_resp = _clean_response(conv_resp) + let_user_respond_str = extract_tag(conv_resp, "let_user_respond") let_user_respond = ( "true" in let_user_respond_str.lower() if let_user_respond_str else False From 070b0f5be5c2ea5f1f9c7afce091a10d8b5b65cb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 14 Oct 2024 11:04:12 -0700 Subject: [PATCH 4/4] add forced responses to streaming --- vision_agent/agent/vision_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index bfd2697b..38857724 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -471,7 +471,7 @@ def chat_with_artifacts( self.streaming_message( { "role": "assistant", - "content": json.dumps(response), + "content": json.dumps(add_step_descriptions(response)), "finished": finished and code_action is None, } )