diff --git a/vision_agent/agent/easytool_prompts.py b/vision_agent/agent/easytool_prompts.py
index 73045ba8..4ae18405 100644
--- a/vision_agent/agent/easytool_prompts.py
+++ b/vision_agent/agent/easytool_prompts.py
@@ -44,7 +44,7 @@
 CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
-1. The Example in the API tool documentation can help you better understand the use of the API.
+1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs.
 2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index d01569fc..79aff944 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -308,6 +308,9 @@ def _handle_extract_frames(
     # handle extract_frames_ case, useful if it extracts frames but doesn't do
     # any following processing
     for video_file_output in tool_result["call_results"]:
+        # When the video tool is run with wrong parameters, exit the loop
+        if len(video_file_output) < 2:
+            break
         for frame, _ in video_file_output:
             image = frame
             if image not in image_to_data:
@@ -447,7 +450,7 @@ def __init__(
         task_model: Optional[Union[LLM, LMM]] = None,
         answer_model: Optional[Union[LLM, LMM]] = None,
         reflect_model: Optional[Union[LLM, LMM]] = None,
-        max_retries: int = 3,
+        max_retries: int = 2,
         verbose: bool = False,
         report_progress_callback: Optional[Callable[[str], None]] = None,
     ):
diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py
index 4cc65845..14a53972 100644
--- a/vision_agent/agent/vision_agent_prompts.py
+++ b/vision_agent/agent/vision_agent_prompts.py
@@ -26,22 +26,24 @@
 Reflection: """
 
-TASK_DECOMPOSE = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
+TASK_DECOMPOSE = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
 This is the tool list:
 {tools}
 
 Please note that:
-1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
-2. If one subtask needs the results from another subtask, you should write clearly. For example:
+1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer.
+2. If the given task is complex, you should decompose this user's complex question into simple subtasks, each of which can be executed easily by using one single tool in the tool list.
+3. You should try to decompose the complex question into the least number of subtasks.
+4. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
-3. You must ONLY output in a parsible JSON format. An example output looks like:
+5. You must ONLY output in a parsible JSON format. An example output looks like:
 {{"Tasks": ["Task 1", "Task 2", ...]}}
 
 Output: """
 
-TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into some simple subtasks and let the model execute it step by step.
+TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step.
 This is the user's question: {question}
 This is the tool list:
@@ -51,10 +53,12 @@
 {reflections}
 
 Please note that:
-1. You should only decompose this user's complex question into some simple subtasks which can be executed easily by using one single tool in the tool list.
-2. If one subtask needs the results from another subtask, you should write clearly. For example:
+1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer.
+2. If the given task is complex, you should decompose this user's complex question into simple subtasks, each of which can be executed easily by using one single tool in the tool list.
+3. You should try to decompose the complex question into the least number of subtasks.
+4. If one subtask needs the results from another subtask, you should write clearly. For example:
 {{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
-3. You must ONLY output in a parsible JSON format. An example output looks like:
+5. You must ONLY output in a parsible JSON format. An example output looks like:
 {{"Tasks": ["Task 1", "Task 2", ...]}}
 
@@ -65,8 +69,10 @@
 {tools}
 
 Please note that:
-1. You should only choose one tool from the Tool List to solve this question.
-2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
+1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question.
+2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question.
+3. You should choose the tool whose return type is most relevant to the answer of the user's question.
+4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 Example 1: {{"ID": 1}}
 Example 2: {{"ID": 2}}
 
@@ -81,8 +87,10 @@
 {reflections}
 
 Please note that:
-1. You should only choose one tool from the Tool List to solve this question.
-2. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
+1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question.
+2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question.
+3. You should choose the tool whose return type is most relevant to the answer of the user's question.
+4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like:
 Example 1: {{"ID": 1}}
 Example 2: {{"ID": 2}}
 
@@ -91,7 +99,7 @@
 CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
 Please note that:
-1. The Example in the API tool documentation can help you better understand the use of the API.
+1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs.
 2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
 3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
 4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
diff --git a/vision_agent/image_utils.py b/vision_agent/image_utils.py
index 23dc8506..1fb68a3f 100644
--- a/vision_agent/image_utils.py
+++ b/vision_agent/image_utils.py
@@ -238,7 +238,7 @@ def overlay_heat_map(
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
 
-    if "heat_map" not in heat_map:
+    if "heat_map" not in heat_map or len(heat_map["heat_map"]) == 0:
         return image.convert("RGB")
 
     image = image.convert("L")
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index fa06a823..a9eca833 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -175,7 +175,7 @@ class GroundingDINO(Tool):
     """
 
     name = "grounding_dino_"
-    description = "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and associated probability scores."
+    description = "'grounding_dino_' is a tool that can detect and count objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores."
     usage = {
         "required_parameters": [
             {"name": "prompt", "type": "str"},
@@ -186,6 +186,13 @@ class GroundingDINO(Tool):
             {"name": "iou_threshold", "type": "float"},
         ],
         "examples": [
+            {
+                "scenario": "Can you detect and count the giraffes and zebras in this image? Image name: animal.jpg",
+                "parameters": {
+                    "prompt": "giraffe. zebra",
+                    "image": "animal.jpg",
+                },
+            },
             {
                 "scenario": "Can you build me a car detector?",
                 "parameters": {"prompt": "car", "image": ""},
@@ -198,7 +205,7 @@ class GroundingDINO(Tool):
                 },
             },
             {
-                "scenario": "Detect the red shirts and green shirst. Image name: shirts.jpg",
+                "scenario": "Detect the red shirts and green shirt. Image name: shirts.jpg",
                 "parameters": {
                     "prompt": "red shirt. green shirt",
                     "image": "shirts.jpg",
@@ -271,7 +278,7 @@ class GroundingSAM(Tool):
     """
 
     name = "grounding_sam_"
-    description = "'grounding_sam_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores."
+ description = "'grounding_sam_' is a tool that can detect and segment objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores." usage = { "required_parameters": [ {"name": "prompt", "type": "str"}, @@ -282,6 +289,13 @@ class GroundingSAM(Tool): {"name": "iou_threshold", "type": "float"}, ], "examples": [ + { + "scenario": "Can you segment the apples and grapes in this image? Image name: fruits.jpg", + "parameters": { + "prompt": "apple. grape", + "image": "fruits.jpg", + }, + }, { "scenario": "Can you build me a car segmentor?", "parameters": {"prompt": "car", "image": ""}, @@ -478,7 +492,7 @@ class ZeroShotCounting(Tool): """ name = "zero_shot_counting_" - description = "'zero_shot_counting_' is a tool that counts and returns the total number of instances of an object present in an image belonging to the same class without a text or visual prompt." + description = "'zero_shot_counting_' is a tool that counts foreground items given only an image and no other information. It returns only the count of the objects in the image" usage = { "required_parameters": [ @@ -486,7 +500,7 @@ class ZeroShotCounting(Tool): ], "examples": [ { - "scenario": "Can you count the lids in the image? Image name: lids.jpg", + "scenario": "Can you count the items in the image? Image name: lids.jpg", "parameters": {"image": "lids.jpg"}, }, { @@ -535,7 +549,7 @@ class VisualPromptCounting(Tool): """ name = "visual_prompt_counting_" - description = "'visual_prompt_counting_' is a tool that can count and return total number of instances of an object present in an image belonging to the same class given an example bounding box." + description = "'visual_prompt_counting_' is a tool that counts foreground items in an image given a visual prompt which is a bounding box describing the object. It returns only the count of the objects in the image." usage = { "required_parameters": [ @@ -544,7 +558,7 @@ class VisualPromptCounting(Tool): ], "examples": [ { - "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the lids in the image ? Image name: lids.jpg", + "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg", "parameters": {"image": "lids.jpg", "prompt": "0.1, 0.1, 0.14, 0.2"}, }, { @@ -552,7 +566,7 @@ class VisualPromptCounting(Tool): "parameters": {"image": "tray.jpg", "prompt": "0.1, 0.1, 0.2, 0.25"}, }, { - "scenario": "Can you build me a few shot object counting tool ? Image name: shirts.jpg", + "scenario": "Can you count this item based on an example, reference_data: '0.1, 0.15, 0.2, 0.2' ? Image name: shirts.jpg", "parameters": { "image": "shirts.jpg", "prompt": "0.1, 0.15, 0.2, 0.2", @@ -605,7 +619,7 @@ class VisualQuestionAnswering(Tool): """ name = "visual_question_answering_" - description = "'visual_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image." + description = "'visual_question_answering_' is a tool that can answer basic questions about the image given a question and an image. 
It returns a text describing the image and the answer to the question" usage = { "required_parameters": [ @@ -672,7 +686,7 @@ class ImageQuestionAnswering(Tool): """ name = "image_question_answering_" - description = "'image_question_answering_' is a tool that can describe the contents of the image and it can also answer basic questions about the image." + description = "'image_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question" usage = { "required_parameters": [ @@ -773,7 +787,7 @@ class BboxArea(Tool): r"""BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places.""" name = "bbox_area_" - description = "'bbox_area_' returns the area of the bounding box in pixels normalized to 2 decimal places." + description = "'bbox_area_' returns the area of the given bounding box in pixels normalized to 2 decimal places." usage = { "required_parameters": [{"name": "bboxes", "type": "List[int]"}], "examples": [ @@ -803,7 +817,7 @@ class SegArea(Tool): r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places.""" name = "seg_area_" - description = "'seg_area_' returns the area of the segmentation mask in pixels normalized to 2 decimal places." + description = "'seg_area_' returns the area of the given segmentation mask in pixels normalized to 2 decimal places." usage = { "required_parameters": [{"name": "masks", "type": "str"}], "examples": [ @@ -883,7 +897,7 @@ def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float: class BboxContains(Tool): name = "bbox_contains_" - description = "Given two bounding boxes, a target bounding box and a region bounding box, 'bbox_contains_' returns the intersection of the two bounding boxes over the target bounding box, reflects the percentage area of the target bounding box overlaps with the region bounding box. This is a good tool for determining if the region object contains the target object." + description = "Given two bounding boxes, a target bounding box and a region bounding box, 'bbox_contains_' returns the intersection of the two bounding boxes which is the percentage area of the target bounding box overlaps with the region bounding box. This is a good tool for determining if the region object contains the target object." usage = { "required_parameters": [ {"name": "target", "type": "List[int]"}, @@ -935,9 +949,7 @@ def __call__( class BoxDistance(Tool): name = "box_distance_" - description = ( - "'box_distance_' returns the minimum distance between two bounding boxes." - ) + description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes" usage = { "required_parameters": [ {"name": "bbox1", "type": "List[int]"}, @@ -945,7 +957,7 @@ class BoxDistance(Tool): ], "examples": [ { - "scenario": "If you want to calculate the distance between the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]", + "scenario": "Calculate the distance between the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]", "parameters": { "bbox1": [0.2, 0.21, 0.34, 0.42], "bbox2": [0.3, 0.31, 0.44, 0.52], @@ -1008,7 +1020,7 @@ def __call__(self, video_uri: str) -> List[Tuple[str, float]]: class OCR(Tool): name = "ocr_" - description = "'ocr_' extracts text from an image." + description = "'ocr_' extracts text from an image. 
It returns a list of detected text, bounding boxes, and confidence scores." usage = { "required_parameters": [ {"name": "image", "type": "str"},