diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 6365c949..eeffd8ac 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -35,7 +35,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_plan, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
diff --git a/vision_agent/agent/vision_agent_planner_prompts.py b/vision_agent/agent/vision_agent_planner_prompts.py
index b76c0e7f..833e2c9b 100644
--- a/vision_agent/agent/vision_agent_planner_prompts.py
+++ b/vision_agent/agent/vision_agent_planner_prompts.py
@@ -93,7 +93,7 @@
 ```python
 import numpy as np
-from vision_agent.tools import save_video, extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking, overlay_bounding_boxes, overlay_segmentation_masks
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking

 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -125,24 +125,15 @@ def get_counts(preds):
 # plan1
 owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
-# overlay bounding boxes on the frames for visualization
-owl_v2_viz = overlay_bounding_boxes(frames, owl_v2_out)
-save_video(frames, owl_v2_viz, "owl_v2_video.mp4")

 # plan2
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
-# overlay bounding boxes on the frames for visualization
-florence2_viz = overlay_bounding_boxes(frames, florence2_out)
-save_video(frames, florence2_viz, "florence2_phrase_grounding.mp4")

 # plan3
 f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
 remove_arrays(f2s2_tracking_out)
 f2s2_counts = get_counts(f2s2_tracking_out)
-# overlay segmentation masks on the frames for visualization
-f2s2_viz = overlay_segmentation_masks(frames, f2s2_tracking_out)
-save_video(frames, f2s2_viz, "florence2_sam2_video_tracking.mp4")

 final_out = {{
     "owl_v2_video": owl_v2_out,
@@ -159,7 +150,6 @@ def get_counts(preds):
 print(final_out)
 print(labels_and_scores)
 print(counts)
-print("Visualizations saved to owl_v2_video.mp4, florence2_phrase_grounding.mp4, florence2_sam2_video_tracking.mp4")
 ```

 --- END EXAMPLE2 ---
@@ -167,7 +157,7 @@ def get_counts(preds):
 1. Write a program to load the media and call each tool and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
-4. Print this final dictionary and save any visualizations to help the user understand the output, prefer overlay_bounding_boxes and overlay_segmentation_masks to display confidence scores.
+4. Print this final dictionary.
 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 """
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index d3a88efc..a8b1e543 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -26,9 +26,10 @@
 **Instructions**:
 1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear.
-2. **Code Generation**: Only use code provided in the Documentation in your tags. For generating vision code always start by calling `generate_vision_plan` and asking the user which plan they want to use, after that call `generate_vision_code` with the chosen plan. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
-3. **Execute**: Do only what the user asked you to do and no more, do not debug vision code for them. If you need to ask the user a question, set `let_user_respond` to `true`.
-4. **Output in JSON**: Respond in the following format in JSON:
+2. **Code Generation**: Only use code provided in the Documentation in your tags. Only use `edit_vision_code` to modify code written by `generate_vision_code`.
+3. **Execute**: Do only what the user asked you to do and no more. If you need to ask the user a question, set `let_user_respond` to `true`.
+4. **Response**: Keep your responses short and concise. Provide the user only with the information they need to continue the conversation.
+5. **Output in JSON**: Respond in the following format in JSON:

 ```json
 {{"thoughts": <your thoughts>, "response": <your response to the user>, "let_user_respond": <a boolean whether or not to let the user respond>}}.
@@ -44,37 +45,14 @@
 Artifact dog.jpg loaded to /path/to/images/dog.jpg
 [End of artifacts]

-AGENT: {"thoughts": "I will first generate a plan and review it with the user", "response": "generate_vision_plan(artifacts, 'dog_detector_plan.json', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false}
-
-OBSERVATION:
-[Start Plan Context, saved at 'dog_detector_plan.json']
-plan1: The most straightforward and reliable approach is to use the owl_v2_image tool, which is specifically designed for object detection. It can detect multiple objects given a text prompt, which is perfect for identifying dogs in an image.
--Load the image 'dogs.jpg' using the load_image tool
--Use the owl_v2_image tool with the prompt 'dog' to detect dogs in the loaded image
--Overlay the detected bounding boxes on the original image using the overlay_bounding_boxes tool
--Save the resulting image with bounding boxes using the save_image tool
-plan2: As an alternative, we can use the florence2_sam2_image tool for segmentation, which might provide more precise outlines of the dogs. This could be useful if we want to highlight the exact shape of the dogs rather than just rectangular bounding boxes.
--Load the image 'dogs.jpg' using the load_image tool
--Use the florence2_sam2_image tool with the prompt 'dog' to segment dogs in the loaded image
--Overlay the segmentation masks on the original image using the overlay_segmentation_masks tool
--Save the resulting image with segmentation masks using the save_image tool
-best plan: plan1
-thoughts: After examining the image, I can see there are indeed two dogs present - a larger white fluffy dog and a smaller gray dog. Plan 1 using the owl_v2_image tool is the most accurate for this task as it correctly detected both dogs with reasonable bounding boxes. Plan 2 with segmentation could work but might be overkill for simple detection. Plan 3 is less suitable as it doesn't provide localization. To improve Plan 1, we could add a confidence threshold of 0.5 to filter out low-confidence detections, though in this case it wouldn't change the result.
-[End Plan Context]
-
-AGENT: {"thoughts": "I will ask the user which plan they like best and any modifications they might make.", "response": "I have generated two plans to detect dogs in the image. I recommend using Plan 1 with the owl_v2_image tool. Do you want to proceed with this plan or would you like to make any modifications?", "let_user_respond": true}
-
-USER: Plan 1 sounds good to me with your modifications.
-
-
-AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'], plan={'thoughts': 'The most straightforward and reliable approach is to use the owl_v2_image tool, which is specifically designed for object detection. It can detect multiple objects given a text prompt, which is perfect for identifying dogs in an image.', 'instructions': ['Load the image \'dogs.jpg\' using the load_image tool', 'Use the owl_v2_image tool with the prompt \'dog\' to detect dogs in the loaded image', 'Overlay the detected bounding boxes on the original image using the overlay_bounding_boxes tool', 'Save the resulting image with bounding boxes using the save_image tool']}, 'Add a confidence threshold of 0.5 to filter out the low-confidence detections.', 'dog_detector_plan.json')", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false}

 OBSERVATION:
 [Artifact dog_detector.py]
 0|from vision_agent.tools import load_image, owl_v2
 1|def detect_dogs(image_path: str):
 2|    image = load_image(image_path)
-3|    dogs = owl_v2("dog", image, box_threshold=0.5)
+3|    dogs = owl_v2("dog", image)
 4|    return dogs
 [End of artifact]

@@ -85,7 +63,7 @@
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}]

-AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected two dogs, do the results look good to you?", "let_user_respond": true}
 """

 EXAMPLES_CODE1_EXTRA = """
@@ -114,7 +92,7 @@
 ----- stdout -----
 [{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}]
-AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true}
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "The code detected one dog, do these results look good to you?", "let_user_respond": true}
 """

 EXAMPLES_CODE2 = """
@@ -141,7 +119,7 @@
 [Artifacts loaded]
 [End of artifacts]

-AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate a plan to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}
+AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true}

 USER: Yes you can use workers.png
@@ -150,40 +128,14 @@
 Artifact workers.png loaded to /path/to/images/workers.png
 [End of artifacts]

-AGENT: {"thoughts": "I will generate a plan and review it with the user.", "response": "generate_vision_plan(artifacts, 'workers_plan.json', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/path/to/images/workers.png'])", "let_user_respond": false}
-
-OBSERVATION:
-[Start Plan Context, saved at workers_plan_context.json]
-plan1: This plan uses owl_v2_image, which is specifically designed for detecting and counting multiple objects given a text prompt. It's likely the most accurate and efficient method for this task.
--Load the image 'workers.png' using the load_image function
--Use owl_v2_image with the prompt 'workers with helmets' to detect and count workers wearing helmets
--Count the number of detections returned by owl_v2_image
--Use overlay_bounding_boxes to visualize the detections on the original image
--Save the visualization using save_image
--Return the count of workers with helmets
-plan2: This plan uses florence2_phrase_grounding, which can detect multiple objects given a text prompt. While it may not be as specialized for counting as owl_v2_image, it could still provide accurate results.
--Load the image 'workers.png' using the load_image function
--Use florence2_phrase_grounding with the prompt 'workers wearing helmets' to detect workers with helmets
--Count the number of detections returned by florence2_phrase_grounding
--Use overlay_bounding_boxes to visualize the detections on the original image
--Save the visualization using save_image
--Return the count of workers with helmets
-best plan: plan2
-thoughts: After examining the image, I can clearly see 4 workers wearing helmets. Plan 2 using florence2_phrase_grounding provides the most accurate count and detection. Plan 1 (owl_v2_image) significantly overcounts, detecting 14 instances which is not correct. The florence2_phrase_grounding approach in Plan 2 correctly identifies 4 workers with helmets, matching my manual count.
-[End Plan Context]
-
-AGENT: {"thoughts": "I will ask the user which plan they like best and any modifications they might make.", "response": "I have generated two plans to count workers with helmets in the image. I recommend using Plan 2 with the florence2_phrase_grounding tool. Do you want to proceed with this plan or would you like to make any modifications?", "let_user_respond": true}
-
-USER: Could you use plan 1 with owl_v2 instead and a confidence threshold of 0.3?
-
-AGENT: {"thoughts": "I will incorporate the users thoughts when generating the vision code.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/path/to/images/workers.png'], plan={'thoughts': 'This plan uses owl_v2_image, which is specifically designed for detecting and counting multiple objects given a text prompt. It's likely the most accurate and efficient method for this task.', 'instructions': ['Load the image \'workers.png\' using the load_image function', 'Use owl_v2_image with the prompt \'workers with helmets\' to detect and count workers wearing helmets', 'Count the number of detections returned by owl_v2_image', 'Use overlay_bounding_boxes to visualize the detections on the original image', 'Save the visualization using save_image', 'Return the count of workers with helmets'], 'Add a confidence threshold of 0.3 to filter out low-confidence detections.', 'workers_plan_context.json'})", "let_user_respond": false}
+AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visualization of the bounding boxes?', media=['/path/to/images/workers.png'])", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py]
-0|from vision_agent.tools import load_image, owl_v2_image, closest_box_distance, overlay_bounding_boxes, save_image
+0|from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
 1|def count_workers_with_helmets(image_path: str, output_path: str):
 2|    image = load_image(image_path)
-3|    detections = owl_v2_image("worker, helmet", image, box_threshold=0.3)
+3|    detections = florence2_phrase_grounding("worker, helmet", image)
 4|    workers = [d for d in detections if d['label'] == 'worker']
 5|    helmets = [d for d in detections if d['label'] == 'helmet']
 6|    count = 0
@@ -206,27 +158,27 @@
 ----- stdout -----
 2

-AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "The code to detect workers with helmets is saved in code.py and the visualization under 'workers_viz.png'.", "let_user_respond": true}

-USER: The detections are slightly off. Can you fine tune the object detection model using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"
+USER: The detections are slightly off. Can you fine tune florence2 using these labels? "[{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}]"

-AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "I will fine tune object detection with the labels you provided object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false}
+AGENT: {"thoughts": "Because the user has supplied me with labels I can call object_detection_fine_tuning on their behalf to fine tune the model", "response": "object_detection_fine_tuning([{'image_path': 'image1.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}, {'image_path': 'image2.png': 'labels': ['worker', 'helmet'], 'bboxes': [[235, 118, 294, 241], [232, 118, 294, 128]]}])", "let_user_respond": false}

 OBSERVATION:
 [Fine tuning id: 23b3b022-5ebf-4798-9373-20ef36429abf]

-AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original owl_v2_image call with the fine tuning id.", "response": "I will not update the code to use the fine tuned model. use_object_detection_fine_tuning(artifacts, 'code.py', '23b3b022-5ebf-4798-9373-20ef36429abf')", "let_user_respond": false}
+AGENT: {"thoughts": "The model has finished fine tuning, I will now replace the original florence2_phrase_grounding call with the fine tuning id.", "response": "use_object_detection_fine_tuning(artifacts, 'code.py', '23b3b022-5ebf-4798-9373-20ef36429abf')", "let_user_respond": false}

 OBSERVATION:
 [Artifact code.py edits]
 ---
 +++
 @@ -1,7 +1,7 @@
- from vision_agent.tools import load_image, owl_v2_image, closest_box_distance, overlay_bounding_boxes, save_image
+ from vision_agent.tools import load_image, florence2_phrase_grounding, closest_box_distance, overlay_bounding_boxes, save_image
  def count_workers_with_helmets(image_path: str, output_path: str):
      image = load_image(image_path)
--    detections = owl_v2_image("worker, helmet", image)
-+    detections = owl_v2_image("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
+-    detections = florence2_phrase_grounding("worker, helmet", image)
++    detections = florence2_phrase_grounding("worker, helmet", image, "23b3b022-5ebf-4798-9373-20ef36429abf")
  workers = [d for d in detections if d['label'] == 'worker']
  helmets = [d for d in detections if d['label'] == 'helmet']
  count = 0
@@ -238,5 +190,5 @@ def count_workers_with_helmets(image_path: str, output_path: str):
 ----- stdout -----
 3

-AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned owl_v2_image model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
+AGENT: {"thoughts": "Three workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have updated the code using the fine tuned florence2_phrase_grounding model to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true}
 """
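For reference, a minimal sketch of the streamlined flow these prompts now describe: the agent calls `generate_vision_code` directly (the `generate_vision_plan` step is gone) and can later point the generated code at a fine-tuned model with `use_object_detection_fine_tuning`. The artifact path, media path, and fine-tuning id below are illustrative placeholders, and the call signatures are assumed to match the examples shown in the prompts above.

```python
from vision_agent.tools.meta_tools import (
    Artifacts,
    generate_vision_code,
    use_object_detection_fine_tuning,
)

# Load previously uploaded media into the artifacts store
# (the path is a placeholder, as in the boilerplate above).
artifacts = Artifacts("/path/to/artifacts")
artifacts.load("/path/to/artifacts")

# Generate vision code in a single step -- no planning call first.
generate_vision_code(
    artifacts,
    "dog_detector.py",
    "Can you write code to detect dogs in this image?",
    media=["/path/to/images/dog.jpg"],
)

# After a fine-tuning job completes, swap the fine-tuned model into the
# generated code by its job id (the id here is the example's placeholder).
use_object_detection_fine_tuning(
    artifacts, "dog_detector.py", "23b3b022-5ebf-4798-9373-20ef36429abf"
)
```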