From c4e50c852bc6d97799ad43e0c398d9d97849ab6a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 19 Aug 2024 09:33:25 -0700 Subject: [PATCH] updated prompts --- .../agent/vision_agent_coder_prompts.py | 4 +- vision_agent/tools/tools.py | 40 +++++++++++++------ 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py index cb4c3eeb..9f4020f8 100644 --- a/vision_agent/agent/vision_agent_coder_prompts.py +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -39,6 +39,7 @@ "plan1": [ {{ + "thoughts": str # your thought process for this plan "instructions": str # what you should do in this task associated with a tool }} ], @@ -127,7 +128,8 @@ **Instructions**: 1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request. -2. Output a JSON object with the following format: +2. Try solving the problem yourself given the image and pick the plan which matches your solution the best. +3. Output a JSON object with the following format: {{ "thoughts": str # your thought process for choosing the best plan "best_plan": str # the best plan you have chosen diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 5aaef147..7025a823 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -303,8 +303,8 @@ def florence2_sam2_video( ) -> List[List[Dict[str, Any]]]: """'florence2_sam2_video' is a tool that can segment and track multiple objects in a video given a text prompt such as category names or referring expressions. The - categories in the text prompt are separated by commas. It returns tracked objects - as masks, labels, and scores for each frame. + categories in the text prompt are separated by commas. It is useful for tracking + and counting across frames without counting duplicates. Parameters: prompt (str): The prompt to ground to the video. @@ -421,12 +421,19 @@ def loca_zero_shot_counting(image: np.ndarray) -> Dict[str, Any]: Returns: Dict[str, Any]: A dictionary containing the key 'count' and the count as a - value. E.g. {count: 12}. + value, e.g. {count: 12} and a heat map for visaulization purposes. Example ------- >>> loca_zero_shot_counting(image) - {'count': 45}, + {'count': 83, + 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 1], + ..., + [ 0, 0, 0, ..., 30, 35, 41], + [ 0, 0, 0, ..., 41, 47, 53], + [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)} """ image_b64 = convert_to_b64(image) @@ -451,12 +458,19 @@ def loca_visual_prompt_counting( Returns: Dict[str, Any]: A dictionary containing the key 'count' and the count as a - value. E.g. {count: 12}. + value, e.g. {count: 12} and a heat map for visaulization purposes. Example ------- >>> loca_visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]}) - {'count': 45}, + {'count': 83, + 'heat_map': array([[ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 0], + [ 0, 0, 0, ..., 0, 0, 1], + ..., + [ 0, 0, 0, ..., 30, 35, 41], + [ 0, 0, 0, ..., 41, 47, 53], + [ 0, 0, 0, ..., 53, 59, 64]], dtype=uint8)} """ image_size = get_image_size(image) @@ -1138,7 +1152,7 @@ def closest_box_distance( def extract_frames( - video_uri: Union[str, Path], fps: float = 0.5 + video_uri: Union[str, Path], fps: float = 1 ) -> List[Tuple[np.ndarray, float]]: """'extract_frames' extracts frames from a video which can be a file path or youtube link, returns a list of tuples (frame, timestamp), where timestamp is the relative @@ -1147,7 +1161,7 @@ def extract_frames( Parameters: video_uri (Union[str, Path]): The path to the video file or youtube link fps (float, optional): The frame rate per second to extract the frames. Defaults - to 0.5. + to 10. Returns: List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame @@ -1249,7 +1263,7 @@ def save_image(image: np.ndarray, file_path: str) -> None: def save_video( - frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 4 + frames: List[np.ndarray], output_video_path: Optional[str] = None, fps: float = 1 ) -> str: """'save_video' is a utility function that saves a list of frames as a mp4 video file on disk. @@ -1500,7 +1514,6 @@ def overlay_heat_map( TOOLS = [ owl_v2, - grounding_sam, extract_frames, ocr, clip, @@ -1508,13 +1521,14 @@ def overlay_heat_map( vit_nsfw_classification, loca_zero_shot_counting, loca_visual_prompt_counting, - florence2_roberta_vqa, florence2_image_caption, florence2_ocr, + florence2_sam2_image, + florence2_sam2_video, + ixc25_image_vqa, + ixc25_video_vqa, detr_segmentation, depth_anything_v2, - generate_soft_edge_image, - dpt_hybrid_midas, generate_pose_image, closest_mask_distance, closest_box_distance,