add support for passing args to visionagentcoder

landing-ai · Sep 5, 2024 · d255b1e · d255b1e
1 parent 3b15c9b
commit d255b1e
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 7 deletions.
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
@@ -14,7 +14,7 @@
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts
+from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
 
@@ -87,11 +87,18 @@ def run_code_action(
  return result, obs
 
 
def parse_execution(
    response: str,
    test_multi_plan: bool = True,
    customed_tool_names: Optional[List[str]] = None,
) -> Optional[str]:
    """Extracts the code inside the first <execute_python> tag of a response and
    forces the extra VisionAgent arguments into it.

    Parameters:
        response (str): The raw model response to search.
        test_multi_plan (bool): Forwarded to use_extra_vision_agent_args.
        customed_tool_names (Optional[List[str]]): Forwarded to
            use_extra_vision_agent_args as its customized_tool_names argument.

    Returns:
        Optional[str]: The extracted (and rewritten) code, or None if the response
            has no <execute_python> tag.
    """
    open_tag = "<execute_python>"
    close_tag = "</execute_python>"

    start = response.find(open_tag)
    if start == -1:
        return None
    start += len(open_tag)

    # str.find returns -1 when the closing tag is missing; slicing with that
    # value would silently drop the last character of the code, so take
    # everything after the opening tag instead.
    end = response.find(close_tag, start)
    code = response[start:] if end == -1 else response[start:end]

    return use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
 
 
@@ -174,6 +181,8 @@ def chat_with_code(
  self,
  chat: List[Message],
  artifacts: Optional[Artifacts] = None,
+ test_multi_plan: bool = True,
+ customized_tool_names: Optional[List[str]] = None,
  ) -> Tuple[List[Message], Artifacts]:
  """Chat with VisionAgent, it will use code to execute actions to accomplish
  its tasks.
@@ -184,6 +193,12 @@ def chat_with_code(
  or if it contains media files, it should be in the format of:
  [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
  artifacts (Optional[Artifacts]): The artifacts to use in the task.
+ test_multi_plan (bool): If True, it will test tools for multiple plans and
+ pick the best one based off of the tool results. If False, it will go
+ with the first plan.
+ customized_tool_names (List[str]): A list of customized tools for agent to
+ pick and use. If not provided, default to full tool set from
+ vision_agent.tools.
 
  Returns:
  List[Message]: The conversation response.
@@ -262,7 +277,9 @@ def chat_with_code(
  if response["let_user_respond"]:
  break
 
- code_action = parse_execution(response["response"])
+ code_action = parse_execution(
+ response["response"], test_multi_plan, customized_tool_names
+ )
 
  if code_action is not None:
  result, obs = run_code_action(

diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
@@ -297,7 +297,12 @@ def edit_code_artifact(
 
 
 def generate_vision_code(
- artifacts: Artifacts, name: str, chat: str, media: List[str]
+ artifacts: Artifacts,
+ name: str,
+ chat: str,
+ media: List[str],
+ test_multi_plan: bool = True,
+ customized_tool_names: Optional[List[str]] = None,
 ) -> str:
  """Generates python code to solve vision based tasks.
 
@@ -306,6 +311,8 @@ def generate_vision_code(
  name (str): The name of the artifact to save the code to.
  chat (str): The chat message from the user.
  media (List[str]): The media files to use.
+ test_multi_plan (bool): Do not change this parameter.
+ customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
  Returns:
  str: The generated code.
@@ -330,7 +337,11 @@ def detect_dogs(image_path: str):
  agent = va.agent.VisionAgentCoder()
 
  fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
- response = agent.chat_with_workflow(fixed_chat, test_multi_plan=False)
+ response = agent.chat_with_workflow(
+ fixed_chat,
+ test_multi_plan=test_multi_plan,
+ customized_tool_names=customized_tool_names,
+ )
  redisplay_results(response["test_result"])
  code = response["code"]
  artifacts[name] = code
@@ -342,14 +353,19 @@ def detect_dogs(image_path: str):
 
 
 def edit_vision_code(
- artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+ artifacts: Artifacts,
+ name: str,
+ chat_history: List[str],
+ media: List[str],
+ customized_tool_names: Optional[List[str]] = None,
 ) -> str:
  """Edits python code to solve a vision based task.
 
  Parameters:
  artifacts (Artifacts): The artifacts object to save the code to.
  name (str): The file path to the code.
  chat_history (List[str]): The chat history to used to generate the code.
+ customized_tool_names (Optional[List[str]]): Do not change this parameter.
 
  Returns:
  str: The edited code.
@@ -386,7 +402,11 @@ def detect_dogs(image_path: str):
  fixed_chat_history.append({"role": "assistant", "content": code})
  fixed_chat_history.append({"role": "user", "content": chat})
 
- response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+ response = agent.chat_with_workflow(
+ fixed_chat_history,
+ test_multi_plan=False,
+ customized_tool_names=customized_tool_names,
+ )
  redisplay_results(response["test_result"])
  code = response["code"]
  artifacts[name] = code
@@ -480,6 +500,49 @@ def get_diff_with_prompts(name: str, before: str, after: str) -> str:
  return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
 
 
# Matches the start of a call to either tool, up to and including its "(".
_VISION_CODE_CALL = re.compile(r"\b(generate_vision_code|edit_vision_code)\s*\(")
# Matches the "name =" prefix of a keyword argument (but not a "==" comparison).
_KEYWORD_ARG = re.compile(r"\s*(\w+)\s*=(?!=)")


def _skip_string_literal(code: str, start: int) -> int:
    """Returns the index just past the string literal whose opening quote is at
    start, or len(code) if the literal is never closed."""
    quote = code[start]
    delim = quote * 3 if code.startswith(quote * 3, start) else quote
    i = start + len(delim)
    while i < len(code):
        if code[i] == "\\":
            # A backslash always protects the next character, even in raw strings.
            i += 2
            continue
        if code.startswith(delim, i):
            return i + len(delim)
        i += 1
    return len(code)


def _split_call_args(code: str, start: int) -> Optional[tuple]:
    """Splits the argument list that begins at start (just after a call's "(")
    into its top-level, comma separated arguments.

    Brackets and string literals are tracked so that commas and parentheses
    nested inside an argument are not mistaken for the end of the call. Comments
    are dropped so text can safely be appended after the last argument.

    Returns:
        Optional[tuple]: (arguments, index just past the closing ")"), or None if
            the call is never closed.
    """
    segments: List[str] = []
    current: List[str] = []
    depth = 0
    i = start
    while i < len(code):
        ch = code[i]
        if ch in "\"'":
            end = _skip_string_literal(code, i)
            current.append(code[i:end])
            i = end
            continue
        if ch == "#":
            newline = code.find("\n", i)
            i = len(code) if newline == -1 else newline
            continue
        if ch in "([{":
            depth += 1
        elif ch in ")]}":
            if depth == 0:
                if ch != ")":
                    return None
                segments.append("".join(current))
                return segments, i + 1
            depth -= 1
        elif ch == "," and depth == 0:
            segments.append("".join(current))
            current = []
            i += 1
            continue
        current.append(ch)
        i += 1
    return None


def use_extra_vision_agent_args(
    code: str,
    test_multi_plan: bool = True,
    customized_tool_names: Optional[List[str]] = None,
) -> str:
    """This is for forcing arguments passed by the user to VisionAgent into the
    VisionAgentCoder call.

    Every generate_vision_code(...) call gets test_multi_plan (and
    customized_tool_names when provided); every edit_vision_code(...) call gets
    customized_tool_names when provided. A keyword the call already passes is
    replaced rather than duplicated, trailing commas are tolerated, and arguments
    containing parentheses (strings, nested calls, tuples) are left intact. Calls
    whose closing parenthesis cannot be found are returned unchanged.

    Parameters:
        code (str): The code to edit.
        test_multi_plan (bool): Do not change this parameter.
        customized_tool_names (Optional[List[str]]): Do not change this parameter.

    Returns:
        str: The edited code.
    """
    tool_kwargs = (
        {}
        if customized_tool_names is None
        else {"customized_tool_names": str(customized_tool_names)}
    )
    # Function name -> {keyword name: source text of the value to force}.
    forced_kwargs = {
        "generate_vision_code": {
            "test_multi_plan": str(test_multi_plan),
            **tool_kwargs,
        },
        "edit_vision_code": tool_kwargs,
    }

    pieces: List[str] = []
    cursor = 0
    for match in _VISION_CODE_CALL.finditer(code):
        func_name = match.group(1)
        overrides = forced_kwargs[func_name]
        # Skip matches nested inside a call that was already rewritten, and
        # calls for which there is nothing to force.
        if match.start() < cursor or not overrides:
            continue

        parsed = _split_call_args(code, match.end())
        if parsed is None:
            continue
        segments, call_end = parsed

        kept = []
        for segment in segments:
            if not segment.strip():
                # Empty argument list or the slot after a trailing comma.
                continue
            keyword = _KEYWORD_ARG.match(segment)
            if keyword and keyword.group(1) in overrides:
                # Drop the existing value; the forced one is appended below.
                continue
            kept.append(segment)

        call_args = [f"{key}={value}" for key, value in overrides.items()]
        kept_text = ",".join(kept).strip()
        if kept_text:
            call_args.insert(0, kept_text)

        pieces.append(code[cursor : match.start()])
        pieces.append(f"{func_name}({', '.join(call_args)})")
        cursor = call_end

    pieces.append(code[cursor:])
    return "".join(pieces)
+
+
 def use_florence2_fine_tuning(
  artifacts: Artifacts, name: str, task: str, fine_tune_id: str
 ) -> str: