Skip to content

Commit

Permalink
Improve Orchestrator (#222)
Browse files Browse the repository at this point in the history
* updating prompts with fine tuning examples

* update docs and add get_diff_with_prompt

* use get_diff_with_prompts for editing code

* add extract frames to util docs

* add prompts around edits

* add support for passing args to visionagentcoder

* for debugging

* format changes

* removed duplicate TOOLS_INFO
  • Loading branch information
dillonalaird authored Sep 9, 2024
1 parent 008e042 commit a7dd110
Show file tree
Hide file tree
Showing 4 changed files with 149 additions and 39 deletions.
23 changes: 20 additions & 3 deletions vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from vision_agent.lmm import LMM, Message, OpenAILMM
from vision_agent.tools import META_TOOL_DOCSTRING
from vision_agent.tools.meta_tools import Artifacts
from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
from vision_agent.utils import CodeInterpreterFactory
from vision_agent.utils.execute import CodeInterpreter, Execution

Expand Down Expand Up @@ -87,11 +87,18 @@ def run_code_action(
return result, obs


def parse_execution(
    response: str,
    test_multi_plan: bool = True,
    customed_tool_names: Optional[List[str]] = None,
) -> Optional[str]:
    """Extract the python code wrapped in <execute_python> tags from a response.

    Parameters:
        response (str): The raw agent response text.
        test_multi_plan (bool): Forwarded into any generate_vision_code call
            found inside the extracted code.
        customed_tool_names (Optional[List[str]]): Forwarded into any
            generate_vision_code/edit_vision_code call found inside the code.

    Returns:
        Optional[str]: The extracted (and argument-augmented) code, or None
            when the response contains no <execute_python> block.
    """
    code = None
    if "<execute_python>" in response:
        code = response[response.find("<execute_python>") + len("<execute_python>") :]
        # Only trim at the closing tag when it actually exists; the previous
        # unconditional slice used find()'s -1 sentinel as an index and
        # silently dropped the final character when the tag was missing.
        end = code.find("</execute_python>")
        if end != -1:
            code = code[:end]

    if code is not None:
        code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
    return code


Expand Down Expand Up @@ -174,6 +181,8 @@ def chat_with_code(
self,
chat: List[Message],
artifacts: Optional[Artifacts] = None,
test_multi_plan: bool = True,
customized_tool_names: Optional[List[str]] = None,
) -> Tuple[List[Message], Artifacts]:
"""Chat with VisionAgent, it will use code to execute actions to accomplish
its tasks.
Expand All @@ -184,6 +193,12 @@ def chat_with_code(
or if it contains media files, it should be in the format of:
[{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
artifacts (Optional[Artifacts]): The artifacts to use in the task.
test_multi_plan (bool): If True, it will test tools for multiple plans and
pick the best one based off of the tool results. If False, it will go
with the first plan.
customized_tool_names (List[str]): A list of customized tools for agent to
pick and use. If not provided, default to full tool set from
vision_agent.tools.
Returns:
List[Message]: The conversation response.
Expand Down Expand Up @@ -262,7 +277,9 @@ def chat_with_code(
if response["let_user_respond"]:
break

code_action = parse_execution(response["response"])
code_action = parse_execution(
response["response"], test_multi_plan, customized_tool_names
)

if code_action is not None:
result, obs = run_code_action(
Expand Down
63 changes: 49 additions & 14 deletions vision_agent/agent/vision_agent_prompts.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
VA_CODE = """
**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
**Role**: You are a helpful agent that assists users with writing code.
**Task**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execute_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved every time you execute python code.
Expand Down Expand Up @@ -56,7 +56,9 @@
AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
"""

EXAMPLES_CODE1_EXTRA = """
USER: The image only has one dog, can you fix this?
[Artifacts loaded]
Expand Down Expand Up @@ -105,25 +107,24 @@
OBSERVATION:
[Artifact code.py]
0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
1|def count_workers_with_helmets(image_path: str, output_path: str):
2| image = load_image(image_path)
3| detections = owl_v2("worker, helmet", image)
3| detections = florence2_phrase_grounding("worker, helmet", image)
4| workers = [d for d in detections if d['label'] == 'worker']
5| helmets = [d for d in detections if d['label'] == 'helmet']
6| count = 0
6| for worker in workers:
7| person_box = worker['bbox']
8| person_has_helmet = False
9| for helmet in helmets:
10| if closest_box_distance(worker['box'], helmet['box']) < 0.01:
11| person_has_helmet = True
12| break
13| if person_has_helmet:
14| count += 1
15| overlay_bounding_boxes(image, detections)
16| save_image(output_path, image)
15| return count
7| person_has_helmet = False
8| for helmet in helmets:
9| if closest_box_distance(worker['box'], helmet['box']) < 0.01:
10| person_has_helmet = True
11| break
11| if person_has_helmet:
13| count += 1
14| overlay_bounding_boxes(image, detections)
15| save_image(output_path, image)
16| return count
[End of artifact]
AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visualization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
Expand All @@ -133,4 +134,38 @@
2
AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")</execute_python>", "let_user_respond": false}
OBSERVATION:
[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will now update the code to use the fine tuned model. <execute_python>use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
OBSERVATION:
[Artifact code.py edits]
---
+++
@@ -1,7 +1,7 @@
from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
def count_workers_with_helmets(image_path: str, output_path: str):
image = load_image(image_path)
- detections = florence2_phrase_grounding("worker, helmet", image)
+ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
workers = [d for d in detections if d['label'] == 'worker']
helmets = [d for d in detections if d['label'] == 'helmet']
count = 0
[End of edits]
AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visualization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
OBSERVATION:
----- stdout -----
3
AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
"""
89 changes: 79 additions & 10 deletions vision_agent/tools/meta_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,12 @@ def edit_code_artifact(


def generate_vision_code(
artifacts: Artifacts, name: str, chat: str, media: List[str]
artifacts: Artifacts,
name: str,
chat: str,
media: List[str],
test_multi_plan: bool = True,
customized_tool_names: Optional[List[str]] = None,
) -> str:
"""Generates python code to solve vision based tasks.
Expand All @@ -306,6 +311,8 @@ def generate_vision_code(
name (str): The name of the artifact to save the code to.
chat (str): The chat message from the user.
media (List[str]): The media files to use.
test_multi_plan (bool): Do not change this parameter.
customized_tool_names (Optional[List[str]]): Do not change this parameter.
Returns:
str: The generated code.
Expand All @@ -330,7 +337,11 @@ def detect_dogs(image_path: str):
agent = va.agent.VisionAgentCoder()

fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True)
response = agent.chat_with_workflow(
fixed_chat,
test_multi_plan=test_multi_plan,
customized_tool_names=customized_tool_names,
)
redisplay_results(response["test_result"])
code = response["code"]
artifacts[name] = code
Expand All @@ -342,14 +353,19 @@ def detect_dogs(image_path: str):


def edit_vision_code(
artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
artifacts: Artifacts,
name: str,
chat_history: List[str],
media: List[str],
customized_tool_names: Optional[List[str]] = None,
) -> str:
"""Edits python code to solve a vision based task.
Parameters:
artifacts (Artifacts): The artifacts object to save the code to.
name (str): The file path to the code.
chat_history (List[str]): The chat history to used to generate the code.
customized_tool_names (Optional[List[str]]): Do not change this parameter.
Returns:
str: The edited code.
Expand Down Expand Up @@ -386,7 +402,11 @@ def detect_dogs(image_path: str):
fixed_chat_history.append({"role": "assistant", "content": code})
fixed_chat_history.append({"role": "user", "content": chat})

response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
response = agent.chat_with_workflow(
fixed_chat_history,
test_multi_plan=False,
customized_tool_names=customized_tool_names,
)
redisplay_results(response["test_result"])
code = response["code"]
artifacts[name] = code
Expand Down Expand Up @@ -425,18 +445,19 @@ def get_tool_descriptions() -> str:


def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
"""'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
"""DO NOT use this function unless the user has supplied you with bboxes.
'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
objects in an image based on a given dataset. It returns the fine tuning job id.
Parameters:
bboxes (List[BboxInput]): A list of BboxInput containing the
image path, labels and bounding boxes.
bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
and bounding boxes. The coordinates are unnormalized.
task (str): The florencev2 fine-tuning task. The options are
'phrase_grounding'.
Returns:
UUID: The fine tuning job id, this id will used to retrieve the fine
tuned model.
str: The fine tuning job id, this id will used to retrieve the fine tuned
model.
Example
-------
Expand Down Expand Up @@ -473,6 +494,54 @@ def get_diff(before: str, after: str) -> str:
)


def get_diff_with_prompts(name: str, before: str, after: str) -> str:
    """Wrap the diff of *before* vs *after* in artifact edit markers.

    Parameters:
        name (str): The artifact name shown in the opening marker.
        before (str): The original artifact content.
        after (str): The edited artifact content.

    Returns:
        str: The diff text bracketed by "[Artifact <name> edits]" and
            "[End of edits]" lines.
    """
    header = f"[Artifact {name} edits]"
    footer = "[End of edits]"
    return "\n".join([header, get_diff(before, after), footer])


def use_extra_vision_agent_args(
    code: str,
    test_multi_plan: bool = True,
    customized_tool_names: Optional[List[str]] = None,
) -> str:
    """This is for forcing arguments passed by the user to VisionAgent into the
    VisionAgentCoder call.

    Parameters:
        code (str): The code to edit.
        test_multi_plan (bool): Do not change this parameter.
        customized_tool_names (Optional[List[str]]): Do not change this parameter.

    Returns:
        str: The edited code.
    """
    # Both replacers close the call the same way: append customized_tool_names
    # only when it was supplied. Build the shared suffix once instead of
    # duplicating the branch in each replacer.
    if customized_tool_names is not None:
        closing = f", customized_tool_names={customized_tool_names})"
    else:
        closing = ")"

    # NOTE(review): these patterns stop at the first ")", so an argument that
    # itself contains parentheses (e.g. a nested call) would be truncated —
    # acceptable for the simple literal arguments the agent emits today.
    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"

    def generate_replacer(match: re.Match) -> str:
        args = match.group(1)
        # generate_vision_code additionally receives test_multi_plan.
        return f"generate_vision_code({args}, test_multi_plan={test_multi_plan}" + closing

    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"

    def edit_replacer(match: re.Match) -> str:
        args = match.group(1)
        return f"edit_vision_code({args}" + closing

    new_code = re.sub(generate_pattern, generate_replacer, code)
    new_code = re.sub(edit_pattern, edit_replacer, new_code)
    return new_code


def use_florence2_fine_tuning(
artifacts: Artifacts, name: str, task: str, fine_tune_id: str
) -> str:
Expand Down Expand Up @@ -521,7 +590,7 @@ def replacer(match: re.Match) -> str:

artifacts[name] = new_code

diff = get_diff(code, new_code)
diff = get_diff_with_prompts(name, code, new_code)
print(diff)
return diff

Expand Down
13 changes: 1 addition & 12 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -1945,15 +1945,4 @@ def overlay_counting_results(
TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore
TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore
TOOLS_INFO = get_tools_info(FUNCTION_TOOLS) # type: ignore
UTILITIES_DOCSTRING = get_tool_documentation(
[
save_json,
load_image,
save_image,
save_video,
overlay_bounding_boxes,
overlay_segmentation_masks,
overlay_heat_map,
overlay_counting_results,
]
)
UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS) # type: ignore

0 comments on commit a7dd110

Please sign in to comment.