Skip to content

Commit

Permalink
switched to tags to reduce parsing issues
Browse files Browse the repository at this point in the history
  • Loading branch information
dillonalaird committed Oct 12, 2024
1 parent 4664e72 commit db8ee75
Show file tree
Hide file tree
Showing 3 changed files with 165 additions and 81 deletions.
10 changes: 9 additions & 1 deletion tests/unit/test_va.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
from vision_agent.agent.vision_agent import parse_execution
from vision_agent.agent.agent_utils import extract_tag
from vision_agent.tools.meta_tools import use_extra_vision_agent_args


def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
code = extract_tag(code, "execute_python")
if not code:
return None
return use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)


def test_parse_execution_zero():
Expand Down
136 changes: 77 additions & 59 deletions vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
import copy
import json
import logging
import os
import tempfile
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast

from vision_agent.agent import Agent
from vision_agent.agent.agent_utils import extract_json
from vision_agent.agent.agent_utils import extract_json, extract_tag
from vision_agent.agent.vision_agent_prompts import (
EXAMPLES_CODE1,
EXAMPLES_CODE2,
Expand Down Expand Up @@ -57,6 +58,32 @@ def add_boilerplate(code: str, **format: Any) -> str:
)


def format_agent_message(agent_message: str) -> str:
agent_message_json = extract_json(agent_message)
output = ""
if "thinking" in agent_message_json and agent_message_json["thinking"]:
output += "<thinking>" + agent_message_json["thinking"] + "</thinking>"
if "response" in agent_message_json and agent_message_json["response"]:
output += "<response>" + agent_message_json["response"] + "</response>"
if "execute_python" in agent_message_json and agent_message_json["execute_python"]:
output += (
"\n<execute_python>\n"
+ agent_message_json["execute_python"]
+ "\n</execute_python>\n"
)
if (
"let_user_respond" in agent_message_json
and agent_message_json["let_user_respond"]
):
output += (
"<let_user_respond>"
+ str(agent_message_json["let_user_respond"])
+ "</let_user_respond>"
)

return output


def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
chat = copy.deepcopy(chat)

Expand All @@ -67,7 +94,7 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
elif chat_i["role"] == "observation":
conversation += f"OBSERVATION:\n{chat_i['content']}\n\n"
elif chat_i["role"] == "assistant":
conversation += f"AGENT: {chat_i['content']}\n\n"
conversation += f"AGENT: {format_agent_message(chat_i['content'])}\n\n"
else:
raise ValueError(f"role {chat_i['role']} is not supported")

Expand All @@ -84,7 +111,19 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]:
and len(chat[-1]["media"]) > 0 # type: ignore
):
message["media"] = chat[-1]["media"]
return extract_json(orch([message], stream=False)) # type: ignore
conv_resp = cast(str, orch([message], stream=False))

let_user_respond_str = extract_tag(conv_resp, "let_user_respond")
let_user_respond = (
"true" in let_user_respond_str.lower() if let_user_respond_str else False
)

return {
"thinking": extract_tag(conv_resp, "thinking"),
"response": extract_tag(conv_resp, "response"),
"execute_python": extract_tag(conv_resp, "execute_python"),
"let_user_respond": let_user_respond,
}


def execute_code_action(
Expand All @@ -100,32 +139,6 @@ def execute_code_action(
return result, obs


def parse_execution(
response: str,
test_multi_plan: bool = True,
custom_tool_names: Optional[List[str]] = None,
) -> Optional[str]:
code = None
remaining = response
all_code = []
while "<execute_python>" in remaining:
code_i = remaining[
remaining.find("<execute_python>") + len("<execute_python>") :
]
code_i = code_i[: code_i.find("</execute_python>")]
remaining = remaining[
remaining.find("</execute_python>") + len("</execute_python>") :
]
all_code.append(code_i)

if len(all_code) > 0:
code = "\n".join(all_code)

if code is not None:
code = use_extra_vision_agent_args(code, test_multi_plan, custom_tool_names)
return code


def execute_user_code_action(
last_user_message: Message,
code_interpreter: CodeInterpreter,
Expand All @@ -138,9 +151,13 @@ def execute_user_code_action(
return user_result, user_obs

last_user_content = cast(str, last_user_message.get("content", ""))
try:
user_code_action = json.loads(last_user_content).get("execute_python", None)
except json.JSONDecodeError:
return user_result, user_obs

user_code_action = parse_execution(last_user_content, False)
if user_code_action is not None:
user_code_action = use_extra_vision_agent_args(user_code_action, False)
user_result, user_obs = execute_code_action(
user_code_action, code_interpreter, artifact_remote_path
)
Expand All @@ -149,29 +166,28 @@ def execute_user_code_action(
return user_result, user_obs


def add_step_descriptions(response: Dict[str, str]) -> Dict[str, str]:
def add_step_descriptions(response: Dict[str, Any]) -> Dict[str, Any]:
response = copy.deepcopy(response)
if "response" in response:
resp_str = response["response"]
if "<execute_python>" in resp_str:
# only include descriptions for these, the rest will just have executing
# code
description_map = {
"open_code_artifact": "Reading file.",
"create_code_artifact": "Creating file.",
"edit_code_artifact": "Editing file.",
"generate_vision_code": "Generating vision code.",
"edit_vision_code": "Editing vision code.",
}
description = ""
for k, v in description_map.items():
if k in resp_str:
description += v + " "
if description == "":
description = "Executing code."
resp_str = resp_str[resp_str.find("<execute_python>") :]
resp_str = description + resp_str
response["response"] = resp_str

if "execute_python" in response and response["execute_python"]:
# only include descriptions for these, the rest will just have executing
# code
description_map = {
"open_code_artifact": "Reading file.",
"create_code_artifact": "Creating file.",
"edit_code_artifact": "Editing file.",
"generate_vision_code": "Generating vision code.",
"edit_vision_code": "Editing vision code.",
}
description = ""
for k, v in description_map.items():
if k in response["execute_python"]:
description += v + " "
if description == "":
description = "Executing code."

response["response"] = description

return response


Expand Down Expand Up @@ -394,25 +410,27 @@ def chat_with_artifacts(
int_chat.append(
{
"role": "assistant",
"content": str(add_step_descriptions(response)),
"content": json.dumps(add_step_descriptions(response)),
}
)
orig_chat.append(
{
"role": "assistant",
"content": str(add_step_descriptions(response)),
"content": json.dumps(add_step_descriptions(response)),
}
)

# sometimes it gets stuck in a loop, so we force it to exit
if last_response == response:
response["let_user_respond"] = True

finished = response["let_user_respond"]
finished = response.get("let_user_respond", False)

code_action = parse_execution(
response["response"], test_multi_plan, custom_tool_names
)
code_action = response.get("execute_python", None)
if code_action is not None:
code_action = use_extra_vision_agent_args(
code_action, test_multi_plan, custom_tool_names
)

if last_response == response:
self.streaming_message(
Expand All @@ -431,7 +449,7 @@ def chat_with_artifacts(
self.streaming_message(
{
"role": "assistant",
"content": response,
"content": json.dumps(response),
"finished": finished and code_action is None,
}
)
Expand Down
Loading

0 comments on commit db8ee75

Please sign in to comment.