switched to tags to reduce parsing issues

landing-ai · Oct 12, 2024 · db8ee75 · db8ee75
1 parent 4664e72
commit db8ee75
Show file tree

Hide file tree

Showing 3 changed files with 165 additions and 81 deletions.
diff --git a/tests/unit/test_va.py b/tests/unit/test_va.py
@@ -1,4 +1,12 @@
-from vision_agent.agent.vision_agent import parse_execution
+from vision_agent.agent.agent_utils import extract_tag
+from vision_agent.tools.meta_tools import use_extra_vision_agent_args
+
+
+def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
+ code = extract_tag(code, "execute_python")
+ if not code:
+ return None
+ return use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
 
 
 def test_parse_execution_zero():

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
@@ -1,12 +1,13 @@
 import copy
+import json
 import logging
 import os
 import tempfile
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
 
 from vision_agent.agent import Agent
-from vision_agent.agent.agent_utils import extract_json
+from vision_agent.agent.agent_utils import extract_json, extract_tag
 from vision_agent.agent.vision_agent_prompts import (
  EXAMPLES_CODE1,
  EXAMPLES_CODE2,
@@ -57,6 +58,32 @@ def add_boilerplate(code: str, **format: Any) -> str:
  )
 
 
+def format_agent_message(agent_message: str) -> str:
+ agent_message_json = extract_json(agent_message)
+ output = ""
+ if "thinking" in agent_message_json and agent_message_json["thinking"]:
+ output += "<thinking>" + agent_message_json["thinking"] + "</thinking>"
+ if "response" in agent_message_json and agent_message_json["response"]:
+ output += "<response>" + agent_message_json["response"] + "</response>"
+ if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
+ output += (
+ "\n<execute_python>\n"
+ + agent_message_json["execute_python"]
+ + "\n</execute_python>\n"
+ )
+ if (
+ "let_user_respond" in agent_message_json
+ and agent_message_json["let_user_respond"]
+ ):
+ output += (
+ "<let_user_respond>"
+ + str(agent_message_json["let_user_respond"])
+ + "</let_user_respond>"
+ )
+
+ return output
+
+
 def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
  chat = copy.deepcopy(chat)
 
@@ -67,7 +94,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
  elif chat_i["role"] == "observation":
  conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
  elif chat_i["role"] == "assistant":
- conversation += f"AGENT: {chat_i['content']}\n\n"
+ conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n"
  else:
  raise ValueError(f"role {chat_i['role']} is not supported")
 
@@ -84,7 +111,19 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
  and len(chat[-1]["media"]) > 0 # type: ignore
  ):
  message["media"] = chat[-1]["media"]
- return extract_json(orch([message], stream=False)) # type: ignore
+ conv_resp = cast(str, orch([message], stream=False))
+
+ let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
+ let_user_respond = (
+ "true" in let_user_respond_str.lower() if let_user_respond_str else False
+ )
+
+ return {
+ "thinking": extract_tag(conv_resp, "thinking"),
+ "response": extract_tag(conv_resp, "response"),
+ "execute_python": extract_tag(conv_resp, "execute_python"),
+ "let_user_respond": let_user_respond,
+ }
 
 
 def execute_code_action(
@@ -100,32 +139,6 @@ def execute_code_action(
  return result, obs
 
 
-def parse_execution(
- response: str,
- test_multi_plan: bool = True,
- custom_tool_names: Optional[List[str]] = None,
-) -> Optional[str]:
- code = None
- remaining = response
- all_code = []
- while "<execute_python>" in remaining:
- code_i = remaining[
- remaining.find("<execute_python>") + len("<execute_python>") :
- ]
- code_i = code_i[: code_i.find("</execute_python>")]
- remaining = remaining[
- remaining.find("</execute_python>") + len("</execute_python>") :
- ]
- all_code.append(code_i)
-
- if len(all_code) > 0:
- code = "\n".join(all_code)
-
- if code is not None:
- code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
- return code
-
-
 def execute_user_code_action(
  last_user_message: Message,
  code_interpreter: CodeInterpreter,
@@ -138,9 +151,13 @@ def execute_user_code_action(
  return user_result, user_obs
 
  last_user_content = cast(str, last_user_message.get("content", ""))
+ try:
+ user_code_action = json.loads(last_user_content).get("execute_python", None)
+ except json.JSONDecodeError:
+ return user_result, user_obs
 
- user_code_action = parse_execution(last_user_content, False)
  if user_code_action is not None:
+ user_code_action = use_extra_vision_agent_args(user_code_action, False)
  user_result, user_obs = execute_code_action(
  user_code_action, code_interpreter, artifact_remote_path
  )
@@ -149,29 +166,28 @@ def execute_user_code_action(
  return user_result, user_obs
 
 
-def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
+def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
  response = copy.deepcopy(response)
- if "response" in response:
- resp_str = response["response"]
- if "<execute_python>" in resp_str:
- # only include descriptions for these, the rest will just have executing
- # code
- description_map = {
- "open_code_artifact": "Reading file.",
- "create_code_artifact": "Creating file.",
- "edit_code_artifact": "Editing file.",
- "generate_vision_code": "Generating vision code.",
- "edit_vision_code": "Editing vision code.",
- }
- description = ""
- for k, v in description_map.items():
- if k in resp_str:
- description += v + " "
- if description == "":
- description = "Executing code."
- resp_str = resp_str[resp_str.find("<execute_python>") :]
- resp_str = description + resp_str
- response["response"] = resp_str
+
+ if "execute_python" in response and response["execute_python"]:
+ # only include descriptions for these, the rest will just have executing
+ # code
+ description_map = {
+ "open_code_artifact": "Reading file.",
+ "create_code_artifact": "Creating file.",
+ "edit_code_artifact": "Editing file.",
+ "generate_vision_code": "Generating vision code.",
+ "edit_vision_code": "Editing vision code.",
+ }
+ description = ""
+ for k, v in description_map.items():
+ if k in response["execute_python"]:
+ description += v + " "
+ if description == "":
+ description = "Executing code."
+
+ response["response"] = description
+
  return response
 
 
@@ -394,25 +410,27 @@ def chat_with_artifacts(
  int_chat.append(
  {
  "role": "assistant",
- "content": str(add_step_descriptions(response)),
+ "content": json.dumps(add_step_descriptions(response)),
  }
  )
  orig_chat.append(
  {
  "role": "assistant",
- "content": str(add_step_descriptions(response)),
+ "content": json.dumps(add_step_descriptions(response)),
  }
  )
 
  # sometimes it gets stuck in a loop, so we force it to exit
  if last_response == response:
  response["let_user_respond"] = True
 
- finished = response["let_user_respond"]
+ finished = response.get("let_user_respond", False)
 
- code_action = parse_execution(
- response["response"], test_multi_plan, custom_tool_names
- )
+ code_action = response.get("execute_python", None)
+ if code_action is not None:
+ code_action = use_extra_vision_agent_args(
+ code_action, test_multi_plan, custom_tool_names
+ )
 
  if last_response == response:
  self.streaming_message(
@@ -431,7 +449,7 @@ def chat_with_artifacts(
  self.streaming_message(
  {
  "role": "assistant",
- "content": response,
+ "content": json.dumps(response),
  "finished": finished and code_action is None,
  }
  )