Improve Orchestrator #222

Merged: 9 commits, Sep 9, 2024. The diff below shows changes from all commits.
23 changes: 20 additions & 3 deletions vision_agent/agent/vision_agent.py
@@ -14,7 +14,7 @@
 )
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING
-from vision_agent.tools.meta_tools import Artifacts
+from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution

@@ -87,11 +87,18 @@ def run_code_action(
     return result, obs


-def parse_execution(response: str) -> Optional[str]:
+def parse_execution(
+    response: str,
+    test_multi_plan: bool = True,
+    customed_tool_names: Optional[List[str]] = None,
+) -> Optional[str]:
     code = None
     if "<execute_python>" in response:
         code = response[response.find("<execute_python>") + len("<execute_python>") :]
         code = code[: code.find("</execute_python>")]
+
+    if code is not None:
+        code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names)
     return code

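A minimal sketch (not part of the diff) of what the reworked `parse_execution` does: it still pulls the snippet out of the `<execute_python>` tags, and now also routes it through `use_extra_vision_agent_args` so the user's VisionAgent arguments are forced into any nested `generate_vision_code` / `edit_vision_code` calls. The response string is hypothetical; note the parameter really is spelled `customed_tool_names` in this signature.

```python
from vision_agent.agent.vision_agent import parse_execution  # assumed import path

response = (
    '{"response": "<execute_python>'
    "generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'])"
    '</execute_python>"}'
)
code = parse_execution(response, test_multi_plan=False, customed_tool_names=["owl_v2"])
# code is now:
# generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'], test_multi_plan=False, customized_tool_names=['owl_v2'])
```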

@@ -174,6 +181,8 @@ def chat_with_code(
         self,
         chat: List[Message],
         artifacts: Optional[Artifacts] = None,
+        test_multi_plan: bool = True,
+        customized_tool_names: Optional[List[str]] = None,
     ) -> Tuple[List[Message], Artifacts]:
         """Chat with VisionAgent, it will use code to execute actions to accomplish
         its tasks.
@@ -184,6 +193,12 @@
                 or if it contains media files, it should be in the format of:
                 [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}]
             artifacts (Optional[Artifacts]): The artifacts to use in the task.
+            test_multi_plan (bool): If True, it will test tools for multiple plans and
+                pick the best one based off of the tool results. If False, it will go
+                with the first plan.
+            customized_tool_names (List[str]): A list of customized tools for agent to
+                pick and use. If not provided, default to full tool set from
+                vision_agent.tools.

         Returns:
             List[Message]: The conversation response.

[Review comment · Collaborator]
Is this similar to what @MingruiZhang developed in the UI, i.e. sending a custom list of tools to use from the overall set of tools?

[Reply · Member Author]
Yes, this is the same thing. It was added to VisionAgentCoder and I just made sure it worked in VisionAgent.
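A hedged usage sketch of the new arguments (not part of the diff; tool names and files are illustrative):

```python
import vision_agent as va

agent = va.agent.VisionAgent()
chat = [{"role": "user", "content": "Count the workers wearing helmets.", "media": ["workers.png"]}]

# Skip multi-plan testing and restrict the coder to two named tools.
messages, artifacts = agent.chat_with_code(
    chat,
    test_multi_plan=False,
    customized_tool_names=["florence2_phrase_grounding", "overlay_bounding_boxes"],
)
```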
@@ -262,7 +277,9 @@ def chat_with_code(
             if response["let_user_respond"]:
                 break

-            code_action = parse_execution(response["response"])
+            code_action = parse_execution(
+                response["response"], test_multi_plan, customized_tool_names
+            )

             if code_action is not None:
                 result, obs = run_code_action(
63 changes: 49 additions & 14 deletions vision_agent/agent/vision_agent_prompts.py
@@ -1,5 +1,5 @@
 VA_CODE = """
-**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it.
+**Role**: You are a helpful agent that assists users with writing code.

 **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with <execution_python>. You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code.

@@ -56,7 +56,9 @@


 AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+"""
+
+EXAMPLES_CODE1_EXTRA = """
 USER: The the image only has one dog, can you fix this?

 [Artifacts loaded]
@@ -105,25 +107,24 @@

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = owl_v2("worker, helmet", image)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
 6|    for worker in workers:
-7|        person_box = worker['bbox']
-8|        person_has_helmet = False
-9|        for helmet in helmets:
-10|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
-11|                person_has_helmet = True
-12|                break
-13|        if person_has_helmet:
-14|            count += 1
-15|    overlay_bounding_boxes(image, detections)
-16|    save_image(output_path, image)
-15|    return count
+7|        person_has_helmet = False
+8|        for helmet in helmets:
+9|            if closest_box_distance(worker['box'], helmet['box']) < 0.01:
+10|                person_has_helmet = True
+11|                break
+11|        if person_has_helmet:
+13|            count += 1
+14|    overlay_bounding_boxes(image, detections)
+15|    save_image(output_path, image)
+16|    return count
 [End of artifact]

AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
@@ -133,4 +134,38 @@
 2

 AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+
+USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
+
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided <execute_python>florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]
+
+
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. <execute_python>use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+[Artifact code.py edits]
+---
++++
+@@ -1,7 +1,7 @@
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
+ def count_workers_with_helmets(image_path: str, output_path: str):
+     image = load_image(image_path)
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+     workers = [d for d in detections if d['label'] == 'worker']
+     helmets = [d for d in detections if d['label'] == 'helmet']
+     count = 0
+[End of edits]
+
+
+AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "<execute_python>from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')</execute_python>", "let_user_respond": false}
+
+OBSERVATION:
+----- stdout -----
+3
+
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
89 changes: 79 additions & 10 deletions vision_agent/tools/meta_tools.py
@@ -297,7 +297,12 @@ def edit_code_artifact(


 def generate_vision_code(
-    artifacts: Artifacts, name: str, chat: str, media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat: str,
+    media: List[str],
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Generates python code to solve vision based tasks.

@@ -306,6 +311,8 @@
         name (str): The name of the artifact to save the code to.
         chat (str): The chat message from the user.
         media (List[str]): The media files to use.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.

     Returns:
         str: The generated code.
@@ -330,7 +337,11 @@ def detect_dogs(image_path: str):
     agent = va.agent.VisionAgentCoder()

     fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}]
-    response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True)
+    response = agent.chat_with_workflow(
+        fixed_chat,
+        test_multi_plan=test_multi_plan,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
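For context, a hedged sketch of how the orchestrator now calls this meta tool from `<execute_python>` (file name and prompt are illustrative); the two new parameters are meant to be injected by `use_extra_vision_agent_args` rather than set by the model:

```python
# Illustrative call; `artifacts` comes from the VisionAgent session.
code = generate_vision_code(
    artifacts,
    "dog_detector.py",
    "Write code to detect dogs in the image",
    ["dogs.jpg"],
    # test_multi_plan / customized_tool_names are deliberately left alone here;
    # parse_execution appends the user's values before execution.
)
```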
@@ -342,14 +353,19 @@


 def edit_vision_code(
-    artifacts: Artifacts, name: str, chat_history: List[str], media: List[str]
+    artifacts: Artifacts,
+    name: str,
+    chat_history: List[str],
+    media: List[str],
+    customized_tool_names: Optional[List[str]] = None,
 ) -> str:
     """Edits python code to solve a vision based task.

     Parameters:
         artifacts (Artifacts): The artifacts object to save the code to.
         name (str): The file path to the code.
         chat_history (List[str]): The chat history to used to generate the code.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.

     Returns:
         str: The edited code.
@@ -386,7 +402,11 @@ def detect_dogs(image_path: str):
     fixed_chat_history.append({"role": "assistant", "content": code})
     fixed_chat_history.append({"role": "user", "content": chat})

-    response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False)
+    response = agent.chat_with_workflow(
+        fixed_chat_history,
+        test_multi_plan=False,
+        customized_tool_names=customized_tool_names,
+    )
     redisplay_results(response["test_result"])
     code = response["code"]
     artifacts[name] = code
@@ -425,18 +445,19 @@ def get_tool_descriptions() -> str:


 def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str:
-    """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
+    """DO NOT use this function unless the user has supplied you with bboxes.
+    'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect
     objects in an image based on a given dataset. It returns the fine tuning job id.

     Parameters:
-        bboxes (List[BboxInput]): A list of BboxInput containing the
-            image path, labels and bounding boxes.
+        bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels
+            and bounding boxes. The coordinates are unnormalized.
         task (str): The florencev2 fine-tuning task. The options are
             'phrase_grounding'.

     Returns:
-        UUID: The fine tuning job id, this id will used to retrieve the fine
-            tuned model.
+        str: The fine tuning job id, this id will used to retrieve the fine tuned
+            model.

     Example
     -------
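The Example section is truncated in this view; a hedged sketch of a call matching the docstring (payload shape taken from the prompt example above, values illustrative):

```python
bboxes = [
    {
        "image_path": "image1.png",
        "labels": ["worker", "helmet"],
        "bboxes": [[235, 118, 294, 241], [232, 118, 294, 128]],  # unnormalized pixel coordinates
    },
]
job_id = florence2_fine_tuning(bboxes, "phrase_grounding")
# job_id is a str (the fine tuning job id), per the corrected Returns section.
```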
@@ -473,6 +494,54 @@ def get_diff(before: str, after: str) -> str:
)


+def get_diff_with_prompts(name: str, before: str, after: str) -> str:
+    diff = get_diff(before, after)
+    return f"[Artifact {name} edits]\n{diff}\n[End of edits]"
+
+
+def use_extra_vision_agent_args(
+    code: str,
+    test_multi_plan: bool = True,
+    customized_tool_names: Optional[List[str]] = None,
+) -> str:
+    """This is for forcing arguments passed by the user to VisionAgent into the
+    VisionAgentCoder call.
+
+    Parameters:
+        code (str): The code to edit.
+        test_multi_plan (bool): Do not change this parameter.
+        customized_tool_names (Optional[List[str]]): Do not change this parameter.
+
+    Returns:
+        str: The edited code.
+    """
+    generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)"
+
+    def generate_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"generate_vision_code({arg}, test_multi_plan={test_multi_plan}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)"
+
+    def edit_replacer(match: re.Match) -> str:
+        arg = match.group(1)
+        out_str = f"edit_vision_code({arg}"
+        if customized_tool_names is not None:
+            out_str += f", customized_tool_names={customized_tool_names})"
+        else:
+            out_str += ")"
+        return out_str
+
+    new_code = re.sub(generate_pattern, generate_replacer, code)
+    new_code = re.sub(edit_pattern, edit_replacer, new_code)
+    return new_code


 def use_florence2_fine_tuning(
     artifacts: Artifacts, name: str, task: str, fine_tune_id: str
 ) -> str:
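A worked before/after sketch (not part of the diff) of the regex rewrite `use_extra_vision_agent_args` performs:

```python
before = "generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'])"
after = use_extra_vision_agent_args(
    before, test_multi_plan=False, customized_tool_names=["owl_v2"]
)
print(after)
# generate_vision_code(artifacts, 'dogs.py', 'detect dogs', ['dogs.jpg'], test_multi_plan=False, customized_tool_names=['owl_v2'])
```

One observation on the pattern: `[^\)]+` stops at the first closing parenthesis, so a call whose arguments themselves contain parentheses (e.g. a nested function call) would not be rewritten cleanly.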
@@ -521,7 +590,7 @@ def replacer(match: re.Match) -> str:

     artifacts[name] = new_code

-    diff = get_diff(code, new_code)
+    diff = get_diff_with_prompts(name, code, new_code)
     print(diff)
     return diff

13 changes: 1 addition & 12 deletions vision_agent/tools/tools.py
@@ -1945,15 +1945,4 @@ def overlay_counting_results(
 TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS)  # type: ignore
 TOOL_DOCSTRING = get_tool_documentation(TOOLS)  # type: ignore
 TOOLS_INFO = get_tools_info(FUNCTION_TOOLS)  # type: ignore
-UTILITIES_DOCSTRING = get_tool_documentation(
-    [
-        save_json,
-        load_image,
-        save_image,
-        save_video,
-        overlay_bounding_boxes,
-        overlay_segmentation_masks,
-        overlay_heat_map,
-        overlay_counting_results,
-    ]
-)
+UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS)  # type: ignore