From abb3ff998ed3654ed37145e88759df8cf6482d50 Mon Sep 17 00:00:00 2001
From: Dillon Laird <dillonalaird@gmail.com>
Date: Sun, 8 Sep 2024 18:56:44 -0700
Subject: [PATCH] fix prompts

---
 .../agent/vision_agent_coder_prompts.py       | 32 +++++++++++++++----
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py
index df68372c..5c2f6518 100644
--- a/vision_agent/agent/vision_agent_coder_prompts.py
+++ b/vision_agent/agent/vision_agent_coder_prompts.py
@@ -108,16 +108,28 @@
 - Use the 'florence2_phrase_grounding' tool with the prompt 'person' to detect where the people are in the video.
 plan3:
 - Extract frames from 'video.mp4' at 10 FPS using the 'extract_frames' tool.
-- Use the 'countgd_counting' tool with the prompt 'person' to detect where the people are in the video.
+- Use the 'florence2_sam2_video_tracking' tool with the prompt 'person' to detect where the people are in the video.
 
 
 ```python
-from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, countgd_counting
+import numpy as np
+from vision_agent.tools import extract_frames, owl_v2_image, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames("video.mp4", 1)
 frames = [f[0] for f in frames][:10]
 
+def remove_arrays(o):
+    # swap numpy arrays for their shape string; dicts are updated in place AND returned
+    if isinstance(o, list):
+        return [remove_arrays(i) for i in o]
+    if isinstance(o, np.ndarray):
+        return str(o.shape)
+    if isinstance(o, dict):
+        for k, v in o.items():
+            o[k] = remove_arrays(v)
+    return o
+
 # plan1
 owl_v2_out = [owl_v2_image("person", f) for f in frames]
 
@@ -125,9 +137,10 @@
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 
 # plan3
-countgd_out = [countgd_counting(f) for f in frames]
+f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
+remove_arrays(f2s2_tracking_out)
 
-final_out = {{"owl_v2_image": owl_v2_out, "florencev2_object_detection": florencev2_out, "countgd_counting": cgd_out}}
+final_out = {{"owl_v2_image": owl_v2_out, "florence2_phrase_grounding": florence2_out, "florence2_sam2_video_tracking": f2s2_tracking_out}}
 print(final_out)
 ```
 """
@@ -328,12 +341,17 @@ def find_text(image_path: str, text: str) -> str:
 This is previous feedback provided on the code:
 {feedback}
 
-Please fix the bug by follow the error information and return a JSON object with the following format:
+Please fix the bug by correcting the error. Return the following data:
+```json
 {{
     "reflections": str # any thoughts you have about the bug and how you fixed it
-    "code": str # the fixed code if any, else an empty string
-    "test": str # the fixed test code if any, else an empty string
+    "which_code": str # which code you fixed, can either be 'code' or 'test'
 }}
+```
+
+```python
+# Your fixed code here
+```
 """