Skip to content

Commit

Permalink
revert to original prompts
Browse files Browse the repository at this point in the history
  • Loading branch information
dillonalaird committed Oct 4, 2024
1 parent e61eeee commit e76cb5b
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 81 deletions.
2 changes: 1 addition & 1 deletion vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class BoilerplateCode:
pre_code = [
"from typing import *",
"from vision_agent.utils.execute import CodeInterpreter",
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_plan, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
"from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
"artifacts = Artifacts('{remote_path}')",
"artifacts.load('{remote_path}')",
]
Expand Down
14 changes: 2 additions & 12 deletions vision_agent/agent/vision_agent_planner_prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@
```python
import numpy as np
from vision_agent.tools import save_video, extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking, overlay_bounding_boxes, overlay_segmentation_masks
from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
# sample at 1 FPS and use the first 10 frames to reduce processing time
frames = extract_frames_and_timestamps("video.mp4", 1)
Expand Down Expand Up @@ -125,24 +125,15 @@ def get_counts(preds):
# plan1
owl_v2_out = owl_v2_video("person", frames)
owl_v2_counts = get_counts(owl_v2_out)
# overlay bounding boxes on the frames for visualization
owl_v2_viz = overlay_bounding_boxes(frames, owl_v2_out)
save_video(frames, owl_v2_viz, "owl_v2_video.mp4")
# plan2
florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
florence2_counts = get_counts(florence2_out)
# overlay bounding boxes on the frames for visualization
florence2_viz = overlay_bounding_boxes(frames, florence2_out)
save_video(frames, florence2_viz, "florence2_phrase_grounding.mp4")
# plan3
f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
remove_arrays(f2s2_tracking_out)
f2s2_counts = get_counts(f2s2_tracking_out)
# overlay segmentation masks on the frames for visualization
f2s2_viz = overlay_segmentation_masks(frames, f2s2_tracking_out)
save_video(frames, f2s2_viz, "florence2_sam2_video_tracking.mp4")
final_out = {{
"owl_v2_video": owl_v2_out,
Expand All @@ -159,15 +150,14 @@ def get_counts(preds):
print(final_out)
print(labels_and_scores)
print(counts)
print("Visualizations saved to owl_v2_video.mp4, florence2_phrase_grounding.mp4, florence2_sam2_video_tracking.mp4")
```
--- END EXAMPLE2 ---
**Instructions**:
1. Write a program to load the media and call each tool and print its output along with other relevant information.
2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
3. Your test case MUST run only on the given images which are {media}
4. Print this final dictionary and save any visualizations to help the user understand the output, prefer overlay_bounding_boxes and overlay_segmentation_masks to display confidence scores.
4. Print this final dictionary.
5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
"""

Expand Down
Loading

0 comments on commit e76cb5b

Please sign in to comment.