diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 45e0310d..1b3d464e 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -68,7 +68,7 @@ 1. **Understand and Clarify**: Make sure you understand the task. 2. **Algorithm/Method Selection**: Decide on the most efficient way. 3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. -4. **Code Generation**: Translate your pseudocode into executable Python code. +4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use the correct arguments, and remember that coordinates returned from `vision_agent.tools` are always normalized. 5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools import *`. Use a debug flag in the function parameters to toggle logging on and off. """ diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index d0ddcd12..7baa02b6 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -60,8 +60,8 @@ def grounding_dino( ) -> List[Dict[str, Any]]: """'grounding_dino' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. The categories in text prompt - are separated by commas or periods. It returns a list and count of bounding boxes, - label names and associated probability scores. + are separated by commas or periods. It returns a list of bounding boxes with + normalized coordinates, label names and associated probability scores. Parameters: prompt (str): The prompt to ground to the image. @@ -73,7 +73,7 @@ def grounding_dino( Returns: List[Dict[str, Any]]: A list of dictionaries containing the score, label, and - bounding box of the detected objects with normalized coordinates + bounding box of the detected objects with normalized coordinates between 0 and 1 (xmin, ymin, xmax, ymax). 
xmin and ymin are the coordinates of the top-left and xmax and ymax are the coordinates of the bottom-right of the bounding box.