From a7dd110c60c8b0dd0a8fa6d8cf77eb618a7d106a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 9 Sep 2024 08:21:25 -0700 Subject: [PATCH] Improve Orchestrator (#222) * updating prompts with fine tuning examples * update docs and add get_diff_with_prompt * use get_diff_with_prompts for editing code * add extract frames to util docs * add prompts around edits * add support for passing args to visionagentcoder * for debugging * format changes * removed duplicate TOOLS_INFO --- vision_agent/agent/vision_agent.py | 23 +++++- vision_agent/agent/vision_agent_prompts.py | 63 +++++++++++---- vision_agent/tools/meta_tools.py | 89 +++++++++++++++++++--- vision_agent/tools/tools.py | 13 +--- 4 files changed, 149 insertions(+), 39 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 4733bb24..776ab964 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -14,7 +14,7 @@ ) from vision_agent.lmm import LMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING -from vision_agent.tools.meta_tools import Artifacts +from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter, Execution @@ -87,11 +87,18 @@ def run_code_action( return result, obs -def parse_execution(response: str) -> Optional[str]: +def parse_execution( + response: str, + test_multi_plan: bool = True, + customed_tool_names: Optional[List[str]] = None, +) -> Optional[str]: code = None if "" in response: code = response[response.find("") + len("") :] code = code[: code.find("")] + + if code is not None: + code = use_extra_vision_agent_args(code, test_multi_plan, customed_tool_names) return code @@ -174,6 +181,8 @@ def chat_with_code( self, chat: List[Message], artifacts: Optional[Artifacts] = None, + test_multi_plan: bool = True, + customized_tool_names: Optional[List[str]] = None, ) -> Tuple[List[Message], Artifacts]: """Chat with VisionAgent, it will use code to execute actions to accomplish its tasks. @@ -184,6 +193,12 @@ def chat_with_code( or if it contains media files, it should be in the format of: [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}] artifacts (Optional[Artifacts]): The artifacts to use in the task. + test_multi_plan (bool): If True, it will test tools for multiple plans and + pick the best one based off of the tool results. If False, it will go + with the first plan. + customized_tool_names (List[str]): A list of customized tools for agent to + pick and use. If not provided, default to full tool set from + vision_agent.tools. Returns: List[Message]: The conversation response. @@ -262,7 +277,9 @@ def chat_with_code( if response["let_user_respond"]: break - code_action = parse_execution(response["response"]) + code_action = parse_execution( + response["response"], test_multi_plan, customized_tool_names + ) if code_action is not None: result, obs = run_code_action( diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index bf9fac80..7b0dd600 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,5 +1,5 @@ VA_CODE = """ -**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it. +**Role**: You are a helpful agent that assists users with writing code. **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code. @@ -56,7 +56,9 @@ AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true} +""" +EXAMPLES_CODE1_EXTRA = """ USER: The the image only has one dog, can you fix this? [Artifacts loaded] @@ -105,25 +107,24 @@ OBSERVATION: [Artifact code.py] -0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image +0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image 1|def count_workers_with_helmets(image_path: str, output_path: str): 2| image = load_image(image_path) -3| detections = owl_v2("worker, helmet", image) +3| detections = florence2_phrase_grounding("worker, helmet", image) 4| workers = [d for d in detections if d['label'] == 'worker'] 5| helmets = [d for d in detections if d['label'] == 'helmet'] 6| count = 0 6| for worker in workers: -7| person_box = worker['bbox'] -8| person_has_helmet = False -9| for helmet in helmets: -10| if closest_box_distance(worker['box'], helmet['box']) < 0.01: -11| person_has_helmet = True -12| break -13| if person_has_helmet: -14| count += 1 -15| overlay_bounding_boxes(image, detections) -16| save_image(output_path, image) -15| return count +7| person_has_helmet = False +8| for helmet in helmets: +9| if closest_box_distance(worker['box'], helmet['box']) < 0.01: +10| person_has_helmet = True +11| break +11| if person_has_helmet: +13| count += 1 +14| overlay_bounding_boxes(image, detections) +15| save_image(output_path, image) +16| return count [End of artifact] AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} @@ -133,4 +134,38 @@ 2 AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} + +USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]" + +AGENT: {"thoughts": "Because the user has supplied me with labels I can call florence2_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune florence2 with the labels you provided florence2_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}], "phrase_grounding")", "let_user_respond": false} + +OBSERVATION: +[Florence2 fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf] + + +AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_florence2_fine_tuning(artifacts, "code.py", "phrase_grounding", "23b3b022-5ebf-4798-9373-20ef36429abf")", "let_user_respond": false} + +OBSERVATION: +[Artifact code.py edits] +--- ++++ +@@ -1,7 +1,7 @@ + from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image + def count_workers_with_helmets(image_path: str, output_path: str): + image = load_image(image_path) +- detections = florence2_phrase_grounding("worker, helmet", image) ++ detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf") + workers = [d for d in detections if d['label'] == 'worker'] + helmets = [d for d in detections if d['label'] == 'helmet'] + count = 0 +[End of edits] + + +AGENT: {"thoughts": "I have updated the code to use the fine tuned model, I must now run the code and print the output and write the visualization to the artifacts so I can see the result and the user can see the visaulization.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} + +OBSERVATION: +----- stdout ----- +3 + +AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} """ diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 3670e600..ccf98287 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -297,7 +297,12 @@ def edit_code_artifact( def generate_vision_code( - artifacts: Artifacts, name: str, chat: str, media: List[str] + artifacts: Artifacts, + name: str, + chat: str, + media: List[str], + test_multi_plan: bool = True, + customized_tool_names: Optional[List[str]] = None, ) -> str: """Generates python code to solve vision based tasks. @@ -306,6 +311,8 @@ def generate_vision_code( name (str): The name of the artifact to save the code to. chat (str): The chat message from the user. media (List[str]): The media files to use. + test_multi_plan (bool): Do not change this parameter. + customized_tool_names (Optional[List[str]]): Do not change this parameter. Returns: str: The generated code. @@ -330,7 +337,11 @@ def detect_dogs(image_path: str): agent = va.agent.VisionAgentCoder() fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] - response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True) + response = agent.chat_with_workflow( + fixed_chat, + test_multi_plan=test_multi_plan, + customized_tool_names=customized_tool_names, + ) redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code @@ -342,7 +353,11 @@ def detect_dogs(image_path: str): def edit_vision_code( - artifacts: Artifacts, name: str, chat_history: List[str], media: List[str] + artifacts: Artifacts, + name: str, + chat_history: List[str], + media: List[str], + customized_tool_names: Optional[List[str]] = None, ) -> str: """Edits python code to solve a vision based task. @@ -350,6 +365,7 @@ def edit_vision_code( artifacts (Artifacts): The artifacts object to save the code to. name (str): The file path to the code. chat_history (List[str]): The chat history to used to generate the code. + customized_tool_names (Optional[List[str]]): Do not change this parameter. Returns: str: The edited code. @@ -386,7 +402,11 @@ def detect_dogs(image_path: str): fixed_chat_history.append({"role": "assistant", "content": code}) fixed_chat_history.append({"role": "user", "content": chat}) - response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) + response = agent.chat_with_workflow( + fixed_chat_history, + test_multi_plan=False, + customized_tool_names=customized_tool_names, + ) redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code @@ -425,18 +445,19 @@ def get_tool_descriptions() -> str: def florence2_fine_tuning(bboxes: List[Dict[str, Any]], task: str) -> str: - """'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect + """DO NOT use this function unless the user has supplied you with bboxes. + 'florence2_fine_tuning' is a tool that fine-tune florence2 to be able to detect objects in an image based on a given dataset. It returns the fine tuning job id. Parameters: - bboxes (List[BboxInput]): A list of BboxInput containing the - image path, labels and bounding boxes. + bboxes (List[BboxInput]): A list of BboxInput containing the image path, labels + and bounding boxes. The coordinates are unnormalized. task (str): The florencev2 fine-tuning task. The options are 'phrase_grounding'. Returns: - UUID: The fine tuning job id, this id will used to retrieve the fine - tuned model. + str: The fine tuning job id, this id will used to retrieve the fine tuned + model. Example ------- @@ -473,6 +494,54 @@ def get_diff(before: str, after: str) -> str: ) +def get_diff_with_prompts(name: str, before: str, after: str) -> str: + diff = get_diff(before, after) + return f"[Artifact {name} edits]\n{diff}\n[End of edits]" + + +def use_extra_vision_agent_args( + code: str, + test_multi_plan: bool = True, + customized_tool_names: Optional[List[str]] = None, +) -> str: + """This is for forcing arguments passed by the user to VisionAgent into the + VisionAgentCoder call. + + Parameters: + code (str): The code to edit. + test_multi_plan (bool): Do not change this parameter. + customized_tool_names (Optional[List[str]]): Do not change this parameter. + + Returns: + str: The edited code. + """ + generate_pattern = r"generate_vision_code\(\s*([^\)]+)\)" + + def generate_replacer(match: re.Match) -> str: + arg = match.group(1) + out_str = f"generate_vision_code({arg}, test_multi_plan={test_multi_plan}" + if customized_tool_names is not None: + out_str += f", customized_tool_names={customized_tool_names})" + else: + out_str += ")" + return out_str + + edit_pattern = r"edit_vision_code\(\s*([^\)]+)\)" + + def edit_replacer(match: re.Match) -> str: + arg = match.group(1) + out_str = f"edit_vision_code({arg}" + if customized_tool_names is not None: + out_str += f", customized_tool_names={customized_tool_names})" + else: + out_str += ")" + return out_str + + new_code = re.sub(generate_pattern, generate_replacer, code) + new_code = re.sub(edit_pattern, edit_replacer, new_code) + return new_code + + def use_florence2_fine_tuning( artifacts: Artifacts, name: str, task: str, fine_tune_id: str ) -> str: @@ -521,7 +590,7 @@ def replacer(match: re.Match) -> str: artifacts[name] = new_code - diff = get_diff(code, new_code) + diff = get_diff_with_prompts(name, code, new_code) print(diff) return diff diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index e8e23ba6..7092a646 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1945,15 +1945,4 @@ def overlay_counting_results( TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore TOOLS_INFO = get_tools_info(FUNCTION_TOOLS) # type: ignore -UTILITIES_DOCSTRING = get_tool_documentation( - [ - save_json, - load_image, - save_image, - save_video, - overlay_bounding_boxes, - overlay_segmentation_masks, - overlay_heat_map, - overlay_counting_results, - ] -) +UTILITIES_DOCSTRING = get_tool_documentation(UTIL_TOOLS) # type: ignore