diff --git a/pyproject.toml b/pyproject.toml
index fcd7b299..3ae0ff21 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.160"
+version = "0.2.162"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI "]
 readme = "README.md"
diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index a682a31c..6a312df9 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -551,7 +551,6 @@ def generate_code_from_plan(
     code = remove_installs_from_code(cast(str, results["code"]))
     test = remove_installs_from_code(cast(str, results["test"]))
     working_memory.extend(results["working_memory"])  # type: ignore
 
-    execution_result = cast(Execution, results["test_result"])
 
     return {
diff --git a/vision_agent/agent/vision_agent_planner_prompts.py b/vision_agent/agent/vision_agent_planner_prompts.py
index 1b56c460..833e2c9b 100644
--- a/vision_agent/agent/vision_agent_planner_prompts.py
+++ b/vision_agent/agent/vision_agent_planner_prompts.py
@@ -93,7 +93,7 @@
 
 ```python
 import numpy as np
-from vision_agent.tools import save_video, extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
+from vision_agent.tools import extract_frames_and_timestamps, owl_v2_video, florence2_phrase_grounding, florence2_sam2_video_tracking
 
 # sample at 1 FPS and use the first 10 frames to reduce processing time
 frames = extract_frames_and_timestamps("video.mp4", 1)
@@ -125,18 +125,15 @@ def get_counts(preds):
 
 # plan1
 owl_v2_out = owl_v2_video("person", frames)
 owl_v2_counts = get_counts(owl_v2_out)
-save_video(frames, owl_v2_out, "owl_v2_video.mp4")
 
 # plan2
 florence2_out = [florence2_phrase_grounding("person", f) for f in frames]
 florence2_counts = get_counts(florence2_out)
-save_video(frames, florence2_out, "florence2_phrase_grounding.mp4")
 
 # plan3
 f2s2_tracking_out = florence2_sam2_video_tracking("person", frames)
 remove_arrays(f2s2_tracking_out)
 f2s2_counts = get_counts(f2s2_tracking_out)
-save_video(frames, f2s2_tracking_out, "florence2_sam2_video_tracking.mp4")
 
 final_out = {{
@@ -153,7 +150,6 @@ def get_counts(preds):
 print(final_out)
 print(labels_and_scores)
 print(counts)
-print("Visualizations saved to owl_v2_video.mp4, florence2_phrase_grounding.mp4, florence2_sam2_video_tracking.mp4")
 ```
 
 --- END EXAMPLE2 ---
@@ -161,7 +157,7 @@ def get_counts(preds):
 1. Write a program to load the media and call each tool and print it's output along with other relevant information.
 2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary.
 3. Your test case MUST run only on the given images which are {media}
-4. Print this final dictionary and save any visualizations to help the user understand the output.
+4. Print this final dictionary.
 5. For video input, sample at 1 FPS and use the first 10 frames only to reduce processing time.
 """