From b846ed8d2dc976802af4813aed68563b99d6ea9b Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Fri, 4 Oct 2024 15:32:30 -0700
Subject: [PATCH] revert to original prompts

---
 vision_agent/agent/vision_agent.py                 |  2 +-
 vision_agent/agent/vision_agent_planner_prompts.py | 14 ++------------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index ac9f4f32..4364ad0f 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -35,7 +35,7 @@ class BoilerplateCode:
     pre_code = [
         "from typing import *",
         "from vision_agent.utils.execute import CodeInterpreter",
-        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_plan, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
+        "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact, view_media_artifact, object_detection_fine_tuning, use_object_detection_fine_tuning",
         "artifacts = Artifacts('{remote_path}')",
         "artifacts.load('{remote_path}')",
     ]
diff --git a/vision_agent/agent/vision_agent_planner_prompts.py b/vision_agent/agent/vision_agent_planner_prompts.py
index b76c0e7f..833e2c9b 100644
--- a/vision_agent/agent/vision_agent_planner_prompts.py
+++ b/vision_agent/agent/vision_agent_planner_prompts.py
@@ -93,7 +93,7 @@
 
 ```python
 import numpy as np
-from vision_agent.tools import save_video, extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking, overlay_bounding_boxes, overlay_segmentation_masks
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -125,24 +125,15 @@ def get_counts(preds):
 # plan1
 owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
-# overlay bounding boxes on the frames for visualization
-owl_v2_viz = overlay_bounding_boxes(frames, owl_v2_out)
-save_video(frames, owl_v2_viz, "owl_v2_video.mp4")
 
 # plan2
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
-# overlay bounding boxes on the frames for visualization
-florence2_viz = overlay_bounding_boxes(frames, florence2_out)
-save_video(frames, florence2_viz, "florence2_phrase_grounding.mp4")
 
 # plan3
 f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
 remove_arrays(f2s2_tracking_out)
 f2s2_counts = get_counts(f2s2_tracking_out)
-# overlay segmentation masks on the frames for visualization
-f2s2_viz = overlay_segmentation_masks(frames, f2s2_tracking_out)
-save_video(frames, f2s2_viz, "florence2_sam2_video_tracking.mp4")
 
 final_out = {{
     "owl_v2_video": owl_v2_out,
@@ -159,7 +150,6 @@ def get_counts(preds):
 print(final_out)
 print(labels_and_scores)
 print(counts)
-print("Visualizations saved to owl_v2_video.mp4, florence2_phrase_grounding.mp4, florence2_sam2_video_tracking.mp4")
 ```
 --- END EXAMPLE2 ---
 
@@ -167,7 +157,7 @@ def get_counts(preds):
 1. Write a program to load the media and call each tool and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
-4. Print this final dictionary and save any visualizations to help the user understand the output, prefer overlay_bounding_boxes and overlay_segmentation_masks to display confidence scores.
+4. Print this final dictionary.
 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 """