Improve Orchestrator #222

Merged: 9 commits, Sep 9, 2024. The diff below shows changes from all commits.
23 changes: 20 additions & 3 deletions vision_agent/agent/vision_agent.py
@@ -14,7 +14,7 @@
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts
+from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution

@@ -87,11 +87,18 @@ def run_code_action(
     return result, obs


-def parse_execution(response: str) -> Optional[str]:
+def parse_execution(
+    response: str,
+    test_multi_plan: bool = True,
+    customed_tool_names: Optional[List[str]] = None,
+) -> Optional[str]:
     code = None
     if "<execute_python>" in response:
         code = response[response.find("<execute_python>") + len("<execute_python>") :]
         code = code[: code.find("</execute_python>")]
+
+    if code is not None:
+        code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
     return code

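A minimal sketch (not part of the diff) of what the reworked `parse_execution` does: it still pulls the snippet out of the `<execute_python>` tags, and now also routes it through `use_extra_vision_agent_args` so the user's VisionAgent arguments are forced into any nested `generate_vision_code` / `edit_vision_code` calls. The response string is hypothetical; note the parameter really is spelled `customed_tool_names` in this signature.

```python
from vision_agent.agent.vision_agent import parse_execution  # assumed import path

response = (
    '{"response": "<execute_python>'
    "generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'])"
    '</execute_python>"}'
)
code = parse_execution(response, test_multi_plan=False, customed_tool_names=["owl_v2"])
# code is now:
# generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'], test_multi_plan=False, customized_tool_names=['owl_v2'])
```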

@@ -174,6 +181,8 @@ def chat_with_code(
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
+        test_multi_plan: bool = True,
+        customized_tool_names: Optional[List[str]] = None,
     ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
@@ -184,6 +193,12 @@
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             artifacts (Optional[Artifacts]): The artifacts to use in the task.
+            test_multi_plan (bool): If True, it will test tools for multiple plans and
+                pick the best one based off of the tool results. If False, it will go
+                with the first plan.
+            customized_tool_names (List[str]): A list of customized tools for agent to
+                pick and use. If not provided, default to full tool set from
+                vision_agent.tools.

         Returns:
             List[Message]: The conversation response.

[Review comment · Collaborator]
Is this similar to what @MingruiZhang developed in the UI, i.e. sending a custom list of tools to use from the overall set of tools?

[Reply · Member Author]
Yes, this is the same thing. It was added to VisionAgentCoder and I just made sure it worked in VisionAgent.
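A hedged usage sketch of the new arguments (not part of the diff; tool names and files are illustrative):

```python
import vision_agent as va

agent = va.agent.VisionAgent()
chat = [{"role": "user", "content": "Count the workers wearing helmets.", "media": ["workers.png"]}]

# Skip multi-plan testing and restrict the coder to two named tools.
messages, artifacts = agent.chat_with_code(
    chat,
    test_multi_plan=False,
    customized_tool_names=["florence2_phrase_grounding", "overlay_bounding_boxes"],
)
```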
@@ -262,7 +277,9 @@ def chat_with_code(
             if response["let_user_respond"]:
                 break

-            code_action = parse_execution(response["response"])
+            code_action = parse_execution(
+                response["response"], test_multi_plan, customized_tool_names
+            )

             if code_action is not None:
                 result, obs = run_code_action(
63 changes: 49 additions & 14 deletions vision_agent/agent/vision_agent_prompts.py
@@ -1,5 +1,5 @@
 VA_CODE = """
-**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
+**Role**: You are a helpful agent that assists users with writing code.

 **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.

@@ -56,7 +56,9 @@


 AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+"""
+
+EXAMPLES_CODE1_EXTRA = """
 USER: The the image only has one dog, can you fix this?

 [Artifacts loaded]
@@ -105,25 +107,24 @@

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = owl_v2("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
 6|    for worker in workers:
-7|        person_box = worker['bbox']
-8|        person_has_helmet = False
-9|        for helmet in helmets:
-10|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
-11|                person_has_helmet = True
-12|                break
-13|        if person_has_helmet:
-14|            count += 1
-15|    overlay_bounding_boxes(image, detections)
-16|    save_image(output_path, image)
-15|    return count
+7|        person_has_helmet = False
+8|        for helmet in helmets:
+9|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
+10|                person_has_helmet = True
+11|                break
+11|        if person_has_helmet:
+13|            count += 1
+14|    overlay_bounding_boxes(image, detections)
+15|    save_image(output_path, image)
+16|    return count
 [End of artifact]

AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
@@ -133,4 +134,38 @@
 2

 AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+
+USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
+
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
+
+
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Artifact code.py edits]
+---
++++
+@@ -1,7 +1,7 @@
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ def count_workers_with_helmets(image_path: str, output_path: str):
+     image = load_image(image_path)
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+     workers = [d for d in detections if d['label'] == 'worker']
+     helmets = [d for d in detections if d['label'] == 'helmet']
+     count = 0
+[End of edits]
+
+
+AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+----- stdout -----
+3
+
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
89 changes: 79 additions & 10 deletions vision_agent/tools/meta_tools.py
@@ -297,7 +297,12 @@ def edit_code_artifact(


 def generate_vision_code(
-    artifacts: Artifacts, name: str, chat: str, media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat: str,
+    media: List[str],
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Generates python code to solve vision based tasks.

@@ -306,6 +311,8 @@
         name (str): The name of the artifact to save the code to.
         chat (str): The chat message from the user.
         media (List[str]): The media files to use.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.

     Returns:
         str: The generated code.
@@ -330,7 +337,11 @@ def detect_dogs(image_path: str):
     agent = va.agent.VisionAgentCoder()

     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True)
+    response = agent.chat_with_workflow(
+        fixed_chat,
+        test_multi_plan=test_multi_plan,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
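For context, a hedged sketch of how the orchestrator now calls this meta tool from `<execute_python>` (file name and prompt are illustrative); the two new parameters are meant to be injected by `use_extra_vision_agent_args` rather than set by the model:

```python
# Illustrative call; `artifacts` comes from the VisionAgent session.
code = generate_vision_code(
    artifacts,
    "dog_detector.py",
    "Write code to detect dogs in the image",
    ["dogs.jpg"],
    # test_multi_plan / customized_tool_names are deliberately left alone here;
    # parse_execution appends the user's values before execution.
)
```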
@@ -342,14 +353,19 @@


 def edit_vision_code(
-    artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat_history: List[str],
+    media: List[str],
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Edits python code to solve a vision based task.

     Parameters:
         artifacts (Artifacts): The artifacts object to save the code to.
         name (str): The file path to the code.
         chat_history (List[str]): The chat history to used to generate the code.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.

     Returns:
         str: The edited code.
@@ -386,7 +402,11 @@ def detect_dogs(image_path: str):
     fixed_chat_history.append({"role": "assistant", "content": code})
     fixed_chat_history.append({"role": "user", "content": chat})

-    response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+    response = agent.chat_with_workflow(
+        fixed_chat_history,
+        test_multi_plan=False,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -425,18 +445,19 @@ def get_tool_descriptions() -> str:


 def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
-    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    """DO NOT use this function unless the user has supplied you with bboxes.
+    'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
     objects in an image based on a given dataset. It returns the fine tuning job id.

     Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
+        bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
+            and bounding boxes. The coordinates are unnormalized.
         task (str): The florencev2 fine-tuning task. The options are
             'phrase_grounding'.

     Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
+        str: The fine tuning job id, this id will used to retrieve the fine tuned
+            model.

     Example
     -------
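The Example section is truncated in this view; a hedged sketch of a call matching the docstring (payload shape taken from the prompt example above, values illustrative):

```python
bboxes = [
    {
        "image_path": "image1.png",
        "labels": ["worker", "helmet"],
        "bboxes": [[235, 118, 294, 241], [232, 118, 294, 128]],  # unnormalized pixel coordinates
    },
]
job_id = florence2_fine_tuning(bboxes, "phrase_grounding")
# job_id is a str (the fine tuning job id), per the corrected Returns section.
```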
@@ -473,6 +494,54 @@ def get_diff(before: str, after: str) -> str:
)


+def get_diff_with_prompts(name: str, before: str, after: str) -> str:
+    diff = get_diff(before, after)
+    return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
+
+
+def use_extra_vision_agent_args(
+    code: str,
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
+) -> str:
+    """This is for forcing arguments passed by the user to VisionAgent into the
+    VisionAgentCoder call.
+
+    Parameters:
+        code (str): The code to edit.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
+
+    Returns:
+        str: The edited code.
+    """
+    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
+
+    def generate_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"generate_vision_code({arg}, test_multi_plan={test_multi_plan}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
+
+    def edit_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"edit_vision_code({arg}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    new_code = re.sub(generate_pattern, generate_replacer, code)
+    new_code = re.sub(edit_pattern, edit_replacer, new_code)
+    return new_code


 def use_florence2_fine_tuning(
     artifacts: Artifacts, name: str, task: str, fine_tune_id: str
 ) -> str:
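A worked before/after sketch (not part of the diff) of the regex rewrite `use_extra_vision_agent_args` performs:

```python
before = "generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'])"
after = use_extra_vision_agent_args(
    before, test_multi_plan=False, customized_tool_names=["owl_v2"]
)
print(after)
# generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'], test_multi_plan=False, customized_tool_names=['owl_v2'])
```

One observation on the pattern: `[^\)]+` stops at the first closing parenthesis, so a call whose arguments themselves contain parentheses (e.g. a nested function call) would not be rewritten cleanly.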
@@ -521,7 +590,7 @@ def replacer(match: re.Match) -> str:

     artifacts[name] = new_code

-    diff = get_diff(code, new_code)
+    diff = get_diff_with_prompts(name, code, new_code)
     print(diff)
     return diff

13 changes: 1 addition & 12 deletions vision_agent/tools/tools.py
@@ -1945,15 +1945,4 @@ def overlay_counting_results(
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 TOOLS_INFO = get_tools_info(FUNCTION_TOOLS)  # type: ignore
-UTILITIES_DOCSTRING = get_tool_documentation(
-    [
-        save_json,
-        load_image,
-        save_image,
-        save_video,
-        overlay_bounding_boxes,
-        overlay_segmentation_masks,
-        overlay_heat_map,
-        overlay_counting_results,
-    ]
-)
+UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS)  # type: ignore