From 079de09e80a5e3815c1997fb26058c05a19934e2 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 29 May 2024 11:18:33 -0700 Subject: [PATCH] Update Docs (#98) * renamed v2 to data interpreter * renamed vision agent to easytool v2 * renamed vision agent v3 to vision agent * renamed image to media * moved tools_v2 to tools * return type of __call__ is str * updated docs * remove unused import * black * added documentation for chat with workflow * formatting fix --- README.md | 159 +- docs/easy_tool_v2.md | 183 ++ tests/fixtures.py | 2 +- tests/test_tools.py | 2 +- tests/test_vision_agent.py | 2 +- tests/tools/test_tools.py | 10 +- vision_agent/agent/__init__.py | 4 +- vision_agent/agent/agent.py | 2 +- vision_agent/agent/agent_coder.py | 14 +- ...vision_agent_v2.py => data_interpreter.py} | 24 +- ...prompts.py => data_interpreter_prompts.py} | 6 +- vision_agent/agent/easytool.py | 16 +- vision_agent/agent/easytool_v2.py | 778 ++++++++ vision_agent/agent/easytool_v2_prompts.py | 152 ++ vision_agent/agent/reflexion.py | 16 +- vision_agent/agent/vision_agent.py | 1051 ++++------- vision_agent/agent/vision_agent_prompts.py | 380 ++-- vision_agent/agent/vision_agent_v3.py | 394 ---- vision_agent/agent/vision_agent_v3_prompts.py | 234 --- vision_agent/llm/llm.py | 7 +- vision_agent/lmm/lmm.py | 12 +- vision_agent/tools/__init__.py | 43 +- vision_agent/tools/easytool_tools.py | 1242 +++++++++++++ vision_agent/tools/tools.py | 1623 ++++++----------- vision_agent/tools/tools_v2.py | 685 ------- 25 files changed, 3615 insertions(+), 3426 deletions(-) create mode 100644 docs/easy_tool_v2.md rename vision_agent/agent/{vision_agent_v2.py => data_interpreter.py} (95%) rename vision_agent/agent/{vision_agent_v2_prompts.py => data_interpreter_prompts.py} (97%) create mode 100644 vision_agent/agent/easytool_v2.py create mode 100644 vision_agent/agent/easytool_v2_prompts.py delete mode 100644 vision_agent/agent/vision_agent_v3.py delete mode 100644 vision_agent/agent/vision_agent_v3_prompts.py create mode 100644 vision_agent/tools/easytool_tools.py delete mode 100644 vision_agent/tools/tools_v2.py diff --git a/README.md b/README.md index c4c16e1b..1926f027 100644 --- a/README.md +++ b/README.md @@ -9,13 +9,12 @@ ![version](https://img.shields.io/pypi/pyversions/vision-agent) -Vision Agent is a library that helps you utilize agent frameworks for your vision tasks. -Many current vision problems can easily take hours or days to solve, you need to find the -right model, figure out how to use it, possibly write programming logic around it to -accomplish the task you want or even more expensive, train your own model. Vision Agent -aims to provide an in-seconds experience by allowing users to describe their problem in -text and utilizing agent frameworks to solve the task for them. Check out our discord -for updates and roadmaps! +Vision Agent is a library that helps you utilize agent frameworks to generate code to +solve your vision task. Many current vision problems can easily take hours or days to +solve, you need to find the right model, figure out how to use it and program it to +accomplish the task you want. Vision Agent aims to provide an in-seconds experience by +allowing users to describe their problem in text and have the agent framework generate +code to solve the task for them. Check out our discord for updates and roadmaps! 
## Documentation @@ -37,70 +36,71 @@ using Azure OpenAI please see the Azure setup section): export OPENAI_API_KEY="your-api-key" ``` -### Vision Agents -You can interact with the agents as you would with any LLM or LMM model: +### Vision Agent +You can interact with the agent as you would with any LLM or LMM model: ```python >>> from vision_agent.agent import VisionAgent >>> agent = VisionAgent() ->>> agent("What percentage of the area of this jar is filled with coffee beans?", image="jar.jpg") -"The percentage of area of the jar filled with coffee beans is 25%." +>>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") ``` -To better understand how the model came up with it's answer, you can also run it in -debug mode by passing in the verbose argument: - +Which produces the following code: ```python ->>> agent = VisionAgent(verbose=True) +from vision_agent.tools import load_image, grounding_sam + +def calculate_filled_percentage(image_path: str) -> float: + # Step 1: Load the image + image = load_image(image_path) + + # Step 2: Segment the jar + jar_segments = grounding_sam(prompt="jar", image=image) + + # Step 3: Segment the coffee beans + coffee_beans_segments = grounding_sam(prompt="coffee beans", image=image) + + # Step 4: Calculate the area of the segmented jar + jar_area = 0 + for segment in jar_segments: + jar_area += segment['mask'].sum() + + # Step 5: Calculate the area of the segmented coffee beans + coffee_beans_area = 0 + for segment in coffee_beans_segments: + coffee_beans_area += segment['mask'].sum() + + # Step 6: Compute the percentage of the jar area that is filled with coffee beans + if jar_area == 0: + return 0.0 # To avoid division by zero + filled_percentage = (coffee_beans_area / jar_area) * 100 + + # Step 7: Return the computed percentage + return filled_percentage ``` -You can also have it return the workflow it used to complete the task along with all -the individual steps and tools to get the answer: +To better understand how the model came up with it's answer, you can run it in debug +mode by passing in the verbose argument: ```python ->>> resp, workflow = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of this jar is filled with coffee beans?"}], image="jar.jpg") ->>> print(workflow) -[{"task": "Segment the jar using 'grounding_sam_'.", - "tool": "grounding_sam_", - "parameters": {"prompt": "jar", "image": "jar.jpg"}, - "call_results": [[ - { - "labels": ["jar"], - "scores": [0.99], - "bboxes": [ - [0.58, 0.2, 0.72, 0.45], - ], - "masks": "mask.png" - } - ]], - "answer": "The jar is located at [0.58, 0.2, 0.72, 0.45].", -}, -{"visualize_output": "final_output.png"}] +>>> agent = VisionAgent(verbose=2) ``` -You can also provide reference data for the model to utilize. For example, if you want -to utilize VisualPromptCounting: +You can also have it return more information by calling `chat_with_workflow`: ```python -agent( - "How many apples are in this image?", - image="apples.jpg", - reference_data={"bbox": [0.1, 0.11, 0.24, 0.25]}, -) +>>> results = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of the jar is filled with coffee beans?"}], media="jar.jpg") +>>> print(results) +{ + "code": "from vision_agent.tools import ..." 
+ "test": "calculate_filled_percentage('jar.jpg')", + "test_result": "...", + "plan": [{"code": "...", "test": "...", "plan": "..."}, ...], + "working_memory": ..., +} ``` -Where `[0.1, 0.11, 0.24, 0.25]` is the normalized bounding box coordinates of an apple. -Similarly for DINOv you can provide a reference image and mask: -```python -agent( - "Can you detect all of the objects similar to the mask I've provided?", - image="image.jpg", - reference_data={"mask": "reference_mask.png", "image": "reference_image.png"}, -) -``` -Here, `reference_mask.png` and `reference_image.png` in `reference_data` could be any -image with it's corresponding mask that is the object you want to detect in `image.jpg`. -You can find a demo app to generate masks for DINOv [here](examples/mask_app/). +With this you can examine more detailed information such as the etesting code, testing +results, plan or working memory it used to complete the task. ### Tools There are a variety of tools for the model or the user to use. Some are executed locally @@ -120,57 +120,6 @@ you. For example: }] ``` -#### Custom Tools -You can also add your own custom tools for your vision agent to use: - -```python -from vision_agent.tools import Tool, register_tool -@register_tool -class NumItems(Tool): - name = "num_items_" - description = "Returns the number of items in a list." - usage = { - "required_parameters": [{"name": "prompt", "type": "list"}], - "examples": [ - { - "scenario": "How many items are in this list? ['a', 'b', 'c']", - "parameters": {"prompt": "['a', 'b', 'c']"}, - } - ], - } - def __call__(self, prompt: list[str]) -> int: - return len(prompt) -``` -This will register it with the list of tools Vision Agent has access to. It will be able -to pick it based on the tool description and use it based on the usage provided. You can -find an example that creates a custom tool for template matching [here](examples/custom_tools/). - -#### Tool List -| Tool | Description | -| --- | --- | -| CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. | -| ImageCaption| ImageCaption is a tool that can generate a caption for an image. | -| GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. | -| GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. | -| DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. | -| Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. | -| BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. | -| SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. | -| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. | -| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. | -| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. | -| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units | -| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. | -| ExtractFrames | ExtractFrames extracts frames with motion from a video. 
| -| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. | -| VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt. | -| VisualQuestionAnswering | VisualQuestionAnswering is a tool that can explain the contents of an image and answer questions about the image. | -| ImageQuestionAnswering | ImageQuestionAnswering is similar to VisualQuestionAnswering but does not rely on OpenAI and instead uses a dedicated model for the task. | -| OCR | OCR returns the text detected in an image along with the location. | - - -It also has a basic set of calculate tools such as add, subtract, multiply and divide. - ### Azure Setup If you want to use Azure OpenAI models, you can set the environment variable: diff --git a/docs/easy_tool_v2.md b/docs/easy_tool_v2.md new file mode 100644 index 00000000..4e4cca7f --- /dev/null +++ b/docs/easy_tool_v2.md @@ -0,0 +1,183 @@ +# 🔍🤖 Easy Tool V2 + +Easy Tool V2 is a library that helps you utilize agent frameworks for your vision tasks. +Many current vision problems can easily take hours or days to solve, you need to find the +right model, figure out how to use it, possibly write programming logic around it to +accomplish the task you want or even more expensive, train your own model. Easy Tool V2 +aims to provide an in-seconds experience by allowing users to describe their problem in +text and utilizing agent frameworks to solve the task for them. Check out our discord +for updates and roadmaps! + +## Documentation + +- [Easy Tool V2 Library Docs](https://landing-ai.github.io/vision-agent/) + + +## Getting Started +### Installation +To get started, you can install the library using pip: + +```bash +pip install vision-agent +``` + +Ensure you have an OpenAI API key and set it as an environment variable (if you are +using Azure OpenAI please see the Azure setup section): + +```bash +export OPENAI_API_KEY="your-api-key" +``` + +### Easy Tool V2 +You can interact with the agents as you would with any LLM or LMM model: + +```python +>>> from vision_agent.agent import EasyToolV2 +>>> agent = EasyToolV2() +>>> agent("What percentage of the area of this jar is filled with coffee beans?", image="jar.jpg") +"The percentage of area of the jar filled with coffee beans is 25%." +``` + +To better understand how the model came up with it's answer, you can also run it in +debug mode by passing in the verbose argument: + +```python +>>> agent = EasyToolV2(verbose=True) +``` + +You can also have it return the workflow it used to complete the task along with all +the individual steps and tools to get the answer: + +```python +>>> resp, workflow = agent.chat_with_workflow([{"role": "user", "content": "What percentage of the area of this jar is filled with coffee beans?"}], image="jar.jpg") +>>> print(workflow) +[{"task": "Segment the jar using 'grounding_sam_'.", + "tool": "grounding_sam_", + "parameters": {"prompt": "jar", "image": "jar.jpg"}, + "call_results": [[ + { + "labels": ["jar"], + "scores": [0.99], + "bboxes": [ + [0.58, 0.2, 0.72, 0.45], + ], + "masks": "mask.png" + } + ]], + "answer": "The jar is located at [0.58, 0.2, 0.72, 0.45].", +}, +{"visualize_output": "final_output.png"}] +``` + +You can also provide reference data for the model to utilize. 
For example, if you want +to utilize VisualPromptCounting: + +```python +agent( + "How many apples are in this image?", + image="apples.jpg", + reference_data={"bbox": [0.1, 0.11, 0.24, 0.25]}, +) +``` +Where `[0.1, 0.11, 0.24, 0.25]` is the normalized bounding box coordinates of an apple. +Similarly for DINOv you can provide a reference image and mask: + +```python +agent( + "Can you detect all of the objects similar to the mask I've provided?", + image="image.jpg", + reference_data={"mask": "reference_mask.png", "image": "reference_image.png"}, +) +``` +Here, `reference_mask.png` and `reference_image.png` in `reference_data` could be any +image with it's corresponding mask that is the object you want to detect in `image.jpg`. +You can find a demo app to generate masks for DINOv [here](examples/mask_app/). + +### Tools +There are a variety of tools for the model or the user to use. Some are executed locally +while others are hosted for you. You can also ask an LLM directly to build a tool for +you. For example: + +```python +>>> import vision_agent as va +>>> llm = va.llm.OpenAILLM() +>>> detector = llm.generate_detector("Can you build a jar detector for me?") +>>> detector("jar.jpg") +[{"labels": ["jar",], + "scores": [0.99], + "bboxes": [ + [0.58, 0.2, 0.72, 0.45], + ] +}] +``` + +#### Custom Tools +You can also add your own custom tools for your vision agent to use: + +```python +from vision_agent.tools import Tool, register_tool +@register_tool +class NumItems(Tool): + name = "num_items_" + description = "Returns the number of items in a list." + usage = { + "required_parameters": [{"name": "prompt", "type": "list"}], + "examples": [ + { + "scenario": "How many items are in this list? ['a', 'b', 'c']", + "parameters": {"prompt": "['a', 'b', 'c']"}, + } + ], + } + def __call__(self, prompt: list[str]) -> int: + return len(prompt) +``` +This will register it with the list of tools Easy Tool V2 has access to. It will be able +to pick it based on the tool description and use it based on the usage provided. You can +find an example that creates a custom tool for template matching [here](examples/custom_tools/). + +#### Tool List +| Tool | Description | +| --- | --- | +| CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. | +| ImageCaption| ImageCaption is a tool that can generate a caption for an image. | +| GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. | +| GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. | +| DINOv | DINOv is a tool that can detect arbitrary objects with using a referring mask. | +| Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. | +| BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. | +| SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. | +| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. | +| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. | +| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. 
| +| MaskDistance | MaskDistance returns the minimum distance between two segmentation masks in pixel units | +| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for check if one box is contained within another box. | +| ExtractFrames | ExtractFrames extracts frames with motion from a video. | +| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. | +| VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt. | +| VisualQuestionAnswering | VisualQuestionAnswering is a tool that can explain the contents of an image and answer questions about the image. | +| ImageQuestionAnswering | ImageQuestionAnswering is similar to VisualQuestionAnswering but does not rely on OpenAI and instead uses a dedicated model for the task. | +| OCR | OCR returns the text detected in an image along with the location. | + + +It also has a basic set of calculate tools such as add, subtract, multiply and divide. + +### Azure Setup +If you want to use Azure OpenAI models, you can set the environment variable: + +```bash +export AZURE_OPENAI_API_KEY="your-api-key" +export AZURE_OPENAI_ENDPOINT="your-endpoint" +``` + +You can then run Easy Tool V2 using the Azure OpenAI models: + +```python +>>> import vision_agent as va +>>> agent = va.agent.EasyToolV2( +>>> task_model=va.llm.AzureOpenAILLM(), +>>> answer_model=va.lmm.AzureOpenAILMM(), +>>> reflection_model=va.lmm.AzureOpenAILMM(), +>>> ) +``` + diff --git a/tests/fixtures.py b/tests/fixtures.py index 036ed9d6..4479e66f 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -2,7 +2,7 @@ import pytest -from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM +from vision_agent.tools.easytool_tools import CLIP, GroundingDINO, GroundingSAM @pytest.fixture diff --git a/tests/test_tools.py b/tests/test_tools.py index 56ca2e02..e5ebe4f3 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1,6 +1,6 @@ import skimage as ski -from vision_agent.tools.tools_v2 import ( +from vision_agent.tools import ( clip, grounding_dino, grounding_sam, diff --git a/tests/test_vision_agent.py b/tests/test_vision_agent.py index 98df1f3f..e05b4914 100644 --- a/tests/test_vision_agent.py +++ b/tests/test_vision_agent.py @@ -1,4 +1,4 @@ -from vision_agent.agent.vision_agent import sample_n_evenly_spaced +from vision_agent.agent.easytool_v2 import sample_n_evenly_spaced def test_sample_n_evenly_spaced_side_cases(): diff --git a/tests/tools/test_tools.py b/tests/tools/test_tools.py index 648dbea9..3e7c0a0d 100644 --- a/tests/tools/test_tools.py +++ b/tests/tools/test_tools.py @@ -5,8 +5,14 @@ import pytest from PIL import Image -from vision_agent.tools import TOOLS, Tool, register_tool -from vision_agent.tools.tools import BboxIoU, BoxDistance, MaskDistance, SegArea, SegIoU +from vision_agent.tools.easytool_tools import TOOLS, Tool, register_tool +from vision_agent.tools.easytool_tools import ( + BboxIoU, + BoxDistance, + MaskDistance, + SegArea, + SegIoU, +) def test_bbox_iou(): diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 2f62dbf1..3d989b34 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,7 +1,7 @@ from .agent import Agent from .agent_coder import AgentCoder +from .data_interpreter import DataInterpreter from .easytool import EasyTool +from .easytool_v2 import EasyToolV2 from .reflexion import Reflexion 
from .vision_agent import VisionAgent -from .vision_agent_v2 import VisionAgentV2 -from .vision_agent_v3 import VisionAgentV3 diff --git a/vision_agent/agent/agent.py b/vision_agent/agent/agent.py index 135319be..ec47ff86 100644 --- a/vision_agent/agent/agent.py +++ b/vision_agent/agent/agent.py @@ -8,7 +8,7 @@ class Agent(ABC): def __call__( self, input: Union[List[Dict[str, str]], str], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, ) -> str: pass diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py index 0e4129ed..bba539f2 100644 --- a/vision_agent/agent/agent_coder.py +++ b/vision_agent/agent/agent_coder.py @@ -18,7 +18,7 @@ ) from vision_agent.llm import LLM, OpenAILLM from vision_agent.lmm import LMM, OpenAILMM -from vision_agent.tools.tools_v2 import TOOL_DOCSTRING, UTILITIES_DOCSTRING +from vision_agent.tools import TOOL_DOCSTRING, UTILITIES_DOCSTRING from vision_agent.utils import Execute IMPORT_HELPER = """ @@ -38,7 +38,7 @@ import string from typing import * from collections import * -from vision_agent.tools.tools_v2 import * +from vision_agent.tools import * """ logging.basicConfig(stream=sys.stdout) _LOGGER = logging.getLogger(__name__) @@ -150,20 +150,20 @@ def __init__( def __call__( self, input: Union[List[Dict[str, str]], str], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, ) -> str: if isinstance(input, str): input = [{"role": "user", "content": input}] - return self.chat(input, image) + return self.chat(input, media) def chat( self, input: List[Dict[str, str]], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, ) -> str: question = input[0]["content"] - if image: - question += f" Input file path: {os.path.abspath(image)}" + if media: + question += f" Input file path: {os.path.abspath(media)}" code = "" feedback = "" diff --git a/vision_agent/agent/vision_agent_v2.py b/vision_agent/agent/data_interpreter.py similarity index 95% rename from vision_agent/agent/vision_agent_v2.py rename to vision_agent/agent/data_interpreter.py index d7bf1372..cabf0240 100644 --- a/vision_agent/agent/vision_agent_v2.py +++ b/vision_agent/agent/data_interpreter.py @@ -10,7 +10,7 @@ from tabulate import tabulate from vision_agent.agent import Agent -from vision_agent.agent.vision_agent_v2_prompts import ( +from vision_agent.agent.data_interpreter_prompts import ( CODE, CODE_SYS_MSG, DEBUG, @@ -25,7 +25,7 @@ USER_REQ_SUBTASK_WM_CONTEXT, ) from vision_agent.llm import LLM, OpenAILLM -from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF +from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF from vision_agent.utils import Execute, Sim logging.basicConfig(level=logging.INFO) @@ -331,11 +331,11 @@ def run_plan( return current_code, current_test, plan, working_memory -class VisionAgentV2(Agent): - """Vision Agent is an AI agentic framework geared towards outputting Python code to - solve vision tasks. It is inspired by MetaGPT's Data Interpreter - https://arxiv.org/abs/2402.18679. Vision Agent has several key features to help it - generate code: +class DataInterpreter(Agent): + """This version of Data Interpreter is an AI agentic framework geared towards + outputting Python code to solve vision tasks. It is inspired by MetaGPT's Data + Interpreter https://arxiv.org/abs/2402.18679. This version of Data Interpreter has + several key features to help it generate code: - A planner to generate a plan of tasks to solve a user requirement. 
The planner can output code tasks or test tasks, where test tasks are used to verify the code. @@ -379,29 +379,29 @@ def __init__( def __call__( self, input: Union[List[Dict[str, str]], str], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, plan: Optional[List[Dict[str, Any]]] = None, ) -> str: if isinstance(input, str): input = [{"role": "user", "content": input}] - results = self.chat_with_workflow(input, image, plan) + results = self.chat_with_workflow(input, media, plan) return results["code"] # type: ignore @traceable def chat_with_workflow( self, chat: List[Dict[str, str]], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, plan: Optional[List[Dict[str, Any]]] = None, ) -> Dict[str, Any]: if len(chat) == 0: raise ValueError("Input cannot be empty.") - if image is not None: + if media is not None: # append file names to all user messages for chat_i in chat: if chat_i["role"] == "user": - chat_i["content"] += f" Image name {image}" + chat_i["content"] += f" Image name {media}" working_code = "" if plan is not None: diff --git a/vision_agent/agent/vision_agent_v2_prompts.py b/vision_agent/agent/data_interpreter_prompts.py similarity index 97% rename from vision_agent/agent/vision_agent_v2_prompts.py rename to vision_agent/agent/data_interpreter_prompts.py index 87895da0..998ccf97 100644 --- a/vision_agent/agent/vision_agent_v2_prompts.py +++ b/vision_agent/agent/data_interpreter_prompts.py @@ -74,15 +74,15 @@ # Constraints - Write a function that accomplishes the 'Current Subtask'. You are supplied code from a previous task under 'Previous Code', do not delete or change previous code unless it contains a bug or it is necessary to complete the 'Current Subtask'. -- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools.tools_v2 import *` import. +- Always prioritize using pre-defined tools or code for the same functionality from 'Tool Info' when working on 'Current Subtask'. You have access to all these tools through the `from vision_agent.tools import *` import. - You may recieve previous trials and errors under 'Previous Task', this is code, output and reflections from previous tasks. You can use these to avoid running in to the same issues when writing your code. -- Use the `save_json` function from `vision_agent.tools.tools_v2` to save your output as a json file. +- Use the `save_json` function from `vision_agent.tools` to save your output as a json file. - Write clean, readable, and well-documented code. # Output While some concise thoughts are helpful, code is absolutely required. If possible, execute your defined functions in the code output. 
Output code in the following format: ```python -from vision_agent.tools.tools_v2 imoprt * +from vision_agent.tools imoprt * # your code goes here ``` diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py index 72a6fd75..4d05838e 100644 --- a/vision_agent/agent/easytool.py +++ b/vision_agent/agent/easytool.py @@ -6,7 +6,7 @@ from vision_agent.llm import LLM, OpenAILLM from vision_agent.lmm import LMM -from vision_agent.tools import TOOLS +from vision_agent.tools.easytool_tools import TOOLS from .agent import Agent from .easytool_prompts import ( @@ -272,7 +272,7 @@ def __init__( def __call__( self, input: Union[List[Dict[str, str]], str], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, ) -> str: """Invoke the vision agent. @@ -285,14 +285,14 @@ def __call__( """ if isinstance(input, str): input = [{"role": "user", "content": input}] - return self.chat(input, image=image) + return self.chat(input, media=media) def chat_with_workflow( - self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None + self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None ) -> Tuple[str, List[Dict]]: question = chat[0]["content"] - if image: - question += f" Image name: {image}" + if media: + question += f" Image name: {media}" tasks = task_decompose( self.task_model, question, @@ -340,7 +340,7 @@ def chat_with_workflow( return answer_summarize(self.answer_model, question, answers), all_tool_results def chat( - self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None + self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None ) -> str: - answer, _ = self.chat_with_workflow(chat, image=image) + answer, _ = self.chat_with_workflow(chat, media=media) return answer diff --git a/vision_agent/agent/easytool_v2.py b/vision_agent/agent/easytool_v2.py new file mode 100644 index 00000000..1ef382e7 --- /dev/null +++ b/vision_agent/agent/easytool_v2.py @@ -0,0 +1,778 @@ +import json +import logging +import sys +import tempfile +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union + +from PIL import Image +from tabulate import tabulate + +from vision_agent.agent.agent import Agent +from vision_agent.agent.easytool_prompts import ( + ANSWER_GENERATE, + ANSWER_SUMMARIZE, + CHOOSE_PARAMETER, + CHOOSE_TOOL, + TASK_DECOMPOSE, + TASK_TOPOLOGY, +) +from vision_agent.agent.easytool_v2_prompts import ( + ANSWER_GENERATE_DEPENDS, + ANSWER_SUMMARIZE_DEPENDS, + CHOOSE_PARAMETER_DEPENDS, + CHOOSE_TOOL_DEPENDS, + TASK_DECOMPOSE_DEPENDS, + VISION_AGENT_REFLECTION, +) +from vision_agent.llm import LLM, OpenAILLM +from vision_agent.lmm import LMM, OpenAILMM +from vision_agent.tools.easytool_tools import TOOLS +from vision_agent.utils.image_utils import ( + convert_to_b64, + overlay_bboxes, + overlay_heat_map, + overlay_masks, +) + +logging.basicConfig(stream=sys.stdout) +_LOGGER = logging.getLogger(__name__) +_MAX_TABULATE_COL_WIDTH = 80 + + +def parse_json(s: str) -> Any: + s = ( + s.replace(": True", ": true") + .replace(": False", ": false") + .replace(":True", ": true") + .replace(":False", ": false") + .replace("```", "") + .strip() + ) + return json.loads(s) + + +def change_name(name: str) -> str: + change_list = ["from", "class", "return", "false", "true", "id", "and", "", "ID"] + if name in change_list: + name = "is_" + name.lower() + return name + + +def format_tools(tools: Dict[int, Any]) -> str: + # Format this way so it's clear what the ID's are + 
tool_str = "" + for key in tools: + tool_str += f"ID: {key} - {tools[key]}\n" + return tool_str + + +def format_tool_usage(tools: Dict[int, Any], tool_result: List[Dict]) -> str: + usage = [] + name_to_usage = {v["name"]: v["usage"] for v in tools.values()} + for tool_res in tool_result: + if "tool_name" in tool_res: + usage.append((tool_res["tool_name"], name_to_usage[tool_res["tool_name"]])) + + usage_str = "" + for tool_name, tool_usage in usage: + usage_str += f"{tool_name} - {tool_usage}\n" + return usage_str + + +def topological_sort(tasks: List[Dict]) -> List[Dict]: + in_degree = {task["id"]: 0 for task in tasks} + for task in tasks: + for dep in task["dep"]: + if dep in in_degree: + in_degree[task["id"]] += 1 + + queue = [task for task in tasks if in_degree[task["id"]] == 0] + sorted_order = [] + + while queue: + current = queue.pop(0) + sorted_order.append(current) + + for task in tasks: + if current["id"] in task["dep"]: + in_degree[task["id"]] -= 1 + if in_degree[task["id"]] == 0: + queue.append(task) + + if len(sorted_order) != len(tasks): + completed_ids = set([task["id"] for task in sorted_order]) + remaining_tasks = [task for task in tasks if task["id"] not in completed_ids] + sorted_order.extend(remaining_tasks) + return sorted_order + + +def task_decompose( + model: Union[LLM, LMM, Agent], + question: str, + tools: Dict[int, Any], + reflections: str, +) -> Optional[Dict]: + if reflections: + prompt = TASK_DECOMPOSE_DEPENDS.format( + question=question, tools=format_tools(tools), reflections=reflections + ) + else: + prompt = TASK_DECOMPOSE.format(question=question, tools=format_tools(tools)) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["Tasks"] # type: ignore + except Exception: + if tries > 10: + _LOGGER.error(f"Failed task_decompose on: {str_result}") + return None + tries += 1 + continue + + +def task_topology( + model: Union[LLM, LMM, Agent], question: str, task_list: List[Dict] +) -> List[Dict[str, Any]]: + prompt = TASK_TOPOLOGY.format(question=question, task_list=task_list) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + for elt in result["Tasks"]: + if isinstance(elt["dep"], str): + elt["dep"] = [int(dep) for dep in elt["dep"].split(",")] + elif isinstance(elt["dep"], int): + elt["dep"] = [elt["dep"]] + elif isinstance(elt["dep"], list): + elt["dep"] = [int(dep) for dep in elt["dep"]] + return result["Tasks"] # type: ignore + except Exception: + if tries > 10: + _LOGGER.error(f"Failed task_topology on: {str_result}") + return task_list + tries += 1 + continue + + +def choose_tool( + model: Union[LLM, LMM, Agent], + question: str, + tools: Dict[int, Any], + reflections: str, +) -> Optional[int]: + if reflections: + prompt = CHOOSE_TOOL_DEPENDS.format( + question=question, tools=format_tools(tools), reflections=reflections + ) + else: + prompt = CHOOSE_TOOL.format(question=question, tools=format_tools(tools)) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["ID"] # type: ignore + except Exception: + if tries > 10: + _LOGGER.error(f"Failed choose_tool on: {str_result}") + return None + tries += 1 + continue + + +def choose_parameter( + model: Union[LLM, LMM, Agent], + question: str, + tool_usage: Dict, + previous_log: str, + reflections: str, +) -> Optional[Any]: + # TODO: should format tool_usage + if reflections: + prompt = 
CHOOSE_PARAMETER_DEPENDS.format( + question=question, + tool_usage=tool_usage, + previous_log=previous_log, + reflections=reflections, + ) + else: + prompt = CHOOSE_PARAMETER.format( + question=question, tool_usage=tool_usage, previous_log=previous_log + ) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["Parameters"] + except Exception: + if tries > 10: + _LOGGER.error(f"Failed choose_parameter on: {str_result}") + return None + tries += 1 + continue + + +def answer_generate( + model: Union[LLM, LMM, Agent], + question: str, + call_results: str, + previous_log: str, + reflections: str, +) -> str: + if reflections: + prompt = ANSWER_GENERATE_DEPENDS.format( + question=question, + call_results=call_results, + previous_log=previous_log, + reflections=reflections, + ) + else: + prompt = ANSWER_GENERATE.format( + question=question, call_results=call_results, previous_log=previous_log + ) + return model(prompt) + + +def answer_summarize( + model: Union[LLM, LMM, Agent], question: str, answers: List[Dict], reflections: str +) -> str: + if reflections: + prompt = ANSWER_SUMMARIZE_DEPENDS.format( + question=question, answers=answers, reflections=reflections + ) + else: + prompt = ANSWER_SUMMARIZE.format(question=question, answers=answers) + return model(prompt) + + +def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any: + try: + return tool()(**parameters) + except Exception as e: + _LOGGER.error(f"Failed function_call on: {e}") + # return error message so it can self-correct + return str(e) + + +def self_reflect( + reflect_model: Union[LLM, LMM], + question: str, + tools: Dict[int, Any], + tool_result: List[Dict], + final_answer: str, + images: Optional[Sequence[Union[str, Path]]] = None, +) -> str: + prompt = VISION_AGENT_REFLECTION.format( + question=question, + tools=format_tools({k: v["description"] for k, v in tools.items()}), + tool_usage=format_tool_usage(tools, tool_result), + tool_results=str(tool_result), + final_answer=final_answer, + ) + if ( + issubclass(type(reflect_model), LMM) + and images is not None + and all([Path(image).suffix in [".jpg", ".jpeg", ".png"] for image in images]) + ): + return reflect_model(prompt, images=images) # type: ignore + return reflect_model(prompt) + + +def parse_reflect(reflect: str) -> Any: + reflect = reflect.strip() + try: + return parse_json(reflect) + except Exception: + _LOGGER.error(f"Failed parse json reflection: {reflect}") + # LMMs have a hard time following directions, so make the criteria less strict + finish = ( + "finish" in reflect.lower() and len(reflect) < 100 + ) or "finish" in reflect.lower()[-10:] + return {"Finish": finish, "Reflection": reflect} + + +def _handle_extract_frames( + image_to_data: Dict[str, Dict], tool_result: Dict +) -> Dict[str, Dict]: + image_to_data = image_to_data.copy() + # handle extract_frames_ case, useful if it extracts frames but doesn't do + # any following processing + for video_file_output in tool_result["call_results"]: + # When the video tool is run with wrong parameters, exit the loop + if not isinstance(video_file_output, tuple) or len(video_file_output) < 2: + break + for frame, _ in video_file_output: + image = frame + if image not in image_to_data: + image_to_data[image] = { + "bboxes": [], + "masks": [], + "heat_map": [], + "labels": [], + "scores": [], + } + return image_to_data + + +def _handle_viz_tools( + image_to_data: Dict[str, Dict], tool_result: Dict +) -> Dict[str, Dict]: + image_to_data = 
image_to_data.copy() + + # handle grounding_sam_ and grounding_dino_ + parameters = tool_result["parameters"] + # parameters can either be a dictionary or list, parameters can also be malformed + # becaus the LLM builds them + if isinstance(parameters, dict): + if "image" not in parameters: + return image_to_data + parameters = [parameters] + elif isinstance(tool_result["parameters"], list): + if len(tool_result["parameters"]) < 1 or ( + "image" not in tool_result["parameters"][0] + ): + return image_to_data + + for param, call_result in zip(parameters, tool_result["call_results"]): + # Calls can fail, so we need to check if the call was successful. It can either: + # 1. return a str or some error that's not a dictionary + # 2. return a dictionary but not have the necessary keys + + if not isinstance(call_result, dict) or ( + "bboxes" not in call_result + and "mask" not in call_result + and "heat_map" not in call_result + ): + return image_to_data + + # if the call was successful, then we can add the image data + image = param["image"] + if image not in image_to_data: + image_to_data[image] = { + "bboxes": [], + "masks": [], + "heat_map": [], + "labels": [], + "scores": [], + } + + image_to_data[image]["bboxes"].extend(call_result.get("bboxes", [])) + image_to_data[image]["labels"].extend(call_result.get("labels", [])) + image_to_data[image]["scores"].extend(call_result.get("scores", [])) + image_to_data[image]["masks"].extend(call_result.get("masks", [])) + # only single heatmap is returned + if "heat_map" in call_result: + image_to_data[image]["heat_map"].append(call_result["heat_map"]) + if "mask_shape" in call_result: + image_to_data[image]["mask_shape"] = call_result["mask_shape"] + + return image_to_data + + +def sample_n_evenly_spaced(lst: Sequence, n: int) -> Sequence: + if n <= 0: + return [] + elif len(lst) == 0: + return [] + elif n == 1: + return [lst[0]] + elif n >= len(lst): + return lst + + spacing = (len(lst) - 1) / (n - 1) + return [lst[round(spacing * i)] for i in range(n)] + + +def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]: + image_to_data: Dict[str, Dict] = {} + for tool_result in all_tool_results: + # only handle bbox/mask tools or frame extraction + if tool_result["tool_name"] not in [ + "grounding_sam_", + "grounding_dino_", + "extract_frames_", + "dinov_", + "zero_shot_counting_", + "visual_prompt_counting_", + "ocr_", + ]: + continue + + if tool_result["tool_name"] == "extract_frames_": + image_to_data = _handle_extract_frames(image_to_data, tool_result) + else: + image_to_data = _handle_viz_tools(image_to_data, tool_result) + + visualized_images = [] + for image_str in image_to_data: + image_path = Path(image_str) + image_data = image_to_data[image_str] + if "_counting_" in tool_result["tool_name"]: + image = overlay_heat_map(image_path, image_data) + else: + image = overlay_masks(image_path, image_data) + image = overlay_bboxes(image, image_data) + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + image.save(f.name) + visualized_images.append(f.name) + return visualized_images + + +class EasyToolV2(Agent): + r"""EasyToolV2 is an agent framework that utilizes tools as well as self + reflection to accomplish tasks, in particular vision tasks. 
EasyToolV2 is based + off of EasyTool https://arxiv.org/abs/2401.06201 and Reflexion + https://arxiv.org/abs/2303.11366 where it will attempt to complete a task and then + reflect on whether or not it was able to accomplish the task based off of the plan + and final results, if not it will redo the task with this newly added reflection. + + Example + ------- + >>> from vision_agent.agent import EasyToolV2 + >>> agent = EasyToolV2() + >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg") + >>> print(resp) + "The total cost is $57.50." + """ + + def __init__( + self, + task_model: Optional[Union[LLM, LMM]] = None, + answer_model: Optional[Union[LLM, LMM]] = None, + reflect_model: Optional[Union[LLM, LMM]] = None, + max_retries: int = 2, + verbose: bool = False, + report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, + ): + """EasyToolV2 constructor. + + Parameters: + task_model: the model to use for task decomposition. + answer_model: the model to use for reasoning and concluding the answer. + reflect_model: the model to use for self reflection. + max_retries: maximum number of retries to attempt to complete the task. + verbose: whether to print more logs. + report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple EasyToolV2 instances are running in parallel. This callback ensures that the progress are not mixed up. + """ + self.task_model = ( + OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0) + if task_model is None + else task_model + ) + self.answer_model = ( + OpenAILLM(model_name="gpt-4-turbo", temperature=0.0) + if answer_model is None + else answer_model + ) + self.reflect_model = ( + OpenAILMM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0) + if reflect_model is None + else reflect_model + ) + self.max_retries = max_retries + self.tools = TOOLS + self.report_progress_callback = report_progress_callback + if verbose: + _LOGGER.setLevel(logging.INFO) + + def __call__( + self, + input: Union[List[Dict[str, str]], str], + media: Optional[Union[str, Path]] = None, + reference_data: Optional[Dict[str, str]] = None, + visualize_output: Optional[bool] = False, + self_reflection: Optional[bool] = True, + ) -> str: + """Invoke the vision agent. + + Parameters: + chat: A conversation in the format of + [{"role": "user", "content": "describe your task here..."}]. + image: The input image referenced in the chat parameter. + reference_data: A dictionary containing the reference image, mask or bounding + box in the format of: + {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} + where the bounding box coordinates are normalized. + visualize_output: Whether to visualize the output. + self_reflection: boolean to enable and disable self reflection. + + Returns: + The result of the vision agent in text. 
+ """ + if isinstance(input, str): + input = [{"role": "user", "content": input}] + return self.chat( + input, + media=media, + visualize_output=visualize_output, + reference_data=reference_data, + self_reflection=self_reflection, + ) + + def log_progress(self, data: Dict[str, Any]) -> None: + _LOGGER.info(data) + if self.report_progress_callback: + self.report_progress_callback(data) + + def _report_visualization_via_callback( + self, images: Sequence[Union[str, Path]] + ) -> None: + """This is intended for streaming the visualization images via the callback to the client side.""" + if self.report_progress_callback: + self.report_progress_callback({"log": ""}) + if images: + for img in images: + self.report_progress_callback( + {"log": f"base:64{convert_to_b64(img)}"} + ) + self.report_progress_callback({"log": ""}) + + def chat_with_workflow( + self, + chat: List[Dict[str, str]], + media: Optional[Union[str, Path]] = None, + reference_data: Optional[Dict[str, str]] = None, + visualize_output: Optional[bool] = False, + self_reflection: Optional[bool] = True, + ) -> Tuple[str, List[Dict]]: + """Chat with EasyToolV2 and return the final answer and all tool results. + + Parameters: + chat: A conversation in the format of + [{"role": "user", "content": "describe your task here..."}]. + image: The input image referenced in the chat parameter. + reference_data: A dictionary containing the reference image, mask or bounding + box in the format of: + {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} + where the bounding box coordinates are normalized. + visualize_output: Whether to visualize the output. + self_reflection: boolean to enable and disable self reflection. + + Returns: + A tuple where the first item is the final answer and the second item is a + list of all the tool results. The last item in the tool results also + contains the visualized output. 
+ """ + if len(chat) == 0: + raise ValueError("Input cannot be empty.") + + question = chat[0]["content"] + if media: + question += f" Image name: {media}" + if reference_data: + question += ( + f" Reference image: {reference_data['image']}" + if "image" in reference_data + else "" + ) + question += ( + f" Reference mask: {reference_data['mask']}" + if "mask" in reference_data + else "" + ) + question += ( + f" Reference bbox: {reference_data['bbox']}" + if "bbox" in reference_data + else "" + ) + + reflections = "" + final_answer = "" + all_tool_results: List[Dict] = [] + + for _ in range(self.max_retries): + task_list = self.create_tasks( + self.task_model, question, self.tools, reflections + ) + + task_depend = {"Original Question": question} + previous_log = "" + answers = [] + for task in task_list: + task_depend[task["id"]] = {"task": task["task"], "answer": "", "call_result": ""} # type: ignore + all_tool_results = [] + + for task in task_list: + task_str = task["task"] + previous_log = str(task_depend) + tool_results, call_results = self.retrieval( + self.task_model, + task_str, + self.tools, + previous_log, + reflections, + ) + answer = answer_generate( + self.answer_model, task_str, call_results, previous_log, reflections + ) + + tool_results["answer"] = answer + all_tool_results.append(tool_results) + + self.log_progress({"log": f"\tCall Result: {call_results}"}) + self.log_progress({"log": f"\tAnswer: {answer}"}) + answers.append({"task": task_str, "answer": answer}) + task_depend[task["id"]]["answer"] = answer # type: ignore + task_depend[task["id"]]["call_result"] = call_results # type: ignore + final_answer = answer_summarize( + self.answer_model, question, answers, reflections + ) + visualized_output = visualize_result(all_tool_results) + all_tool_results.append({"visualized_output": visualized_output}) + if len(visualized_output) > 0: + reflection_images = sample_n_evenly_spaced(visualized_output, 3) + elif media is not None: + reflection_images = [media] + else: + reflection_images = None + + if self_reflection: + reflection = self_reflect( + self.reflect_model, + question, + self.tools, + all_tool_results, + final_answer, + reflection_images, + ) + self.log_progress({"log": f"Reflection: {reflection}"}) + parsed_reflection = parse_reflect(reflection) + if parsed_reflection["Finish"]: + break + else: + reflections += "\n" + parsed_reflection["Reflection"] + else: + self.log_progress( + {"log": "Self Reflection skipped based on user request."} + ) + break + # '' is a symbol to indicate the end of the chat, which is useful for streaming logs. + self.log_progress( + { + "log": f"EasyToolV2 has concluded this chat. 
{final_answer}" + } + ) + + if visualize_output: + viz_images: Sequence[Union[str, Path]] = all_tool_results[-1][ + "visualized_output" + ] + self._report_visualization_via_callback(viz_images) + for img in viz_images: + Image.open(img).show() + + return final_answer, all_tool_results + + def chat( + self, + chat: List[Dict[str, str]], + media: Optional[Union[str, Path]] = None, + reference_data: Optional[Dict[str, str]] = None, + visualize_output: Optional[bool] = False, + self_reflection: Optional[bool] = True, + ) -> str: + answer, _ = self.chat_with_workflow( + chat, + media=media, + visualize_output=visualize_output, + reference_data=reference_data, + self_reflection=self_reflection, + ) + return answer + + def retrieval( + self, + model: Union[LLM, LMM, Agent], + question: str, + tools: Dict[int, Any], + previous_log: str, + reflections: str, + ) -> Tuple[Dict, str]: + tool_id = choose_tool( + model, + question, + {k: v["description"] for k, v in tools.items()}, + reflections, + ) + if tool_id is None: + return {}, "" + + tool_instructions = tools[tool_id] + tool_usage = tool_instructions["usage"] + tool_name = tool_instructions["name"] + + parameters = choose_parameter( + model, question, tool_usage, previous_log, reflections + ) + if parameters is None: + return {}, "" + tool_results = { + "task": question, + "tool_name": tool_name, + "parameters": parameters, + } + + self.log_progress( + { + "log": f"""Going to run the following tool(s) in sequence: +{tabulate(tabular_data=[tool_results], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" + } + ) + + def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: + call_results: List[Any] = [] + if isinstance(result["parameters"], Dict): + call_results.append( + function_call(tools[tool_id]["class"], result["parameters"]) + ) + elif isinstance(result["parameters"], List): + for parameters in result["parameters"]: + call_results.append( + function_call(tools[tool_id]["class"], parameters) + ) + return call_results + + call_results = parse_tool_results(tool_results) + tool_results["call_results"] = call_results + + call_results_str = str(call_results) + return tool_results, call_results_str + + def create_tasks( + self, + task_model: Union[LLM, LMM], + question: str, + tools: Dict[int, Any], + reflections: str, + ) -> List[Dict]: + tasks = task_decompose( + task_model, + question, + {k: v["description"] for k, v in tools.items()}, + reflections, + ) + if tasks is not None: + task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)] + task_list = task_topology(task_model, question, task_list) + try: + task_list = topological_sort(task_list) + except Exception: + _LOGGER.error(f"Failed topological_sort on: {task_list}") + else: + task_list = [] + self.log_progress( + { + "log": "Planned tasks:", + "plan": task_list, + } + ) + return task_list diff --git a/vision_agent/agent/easytool_v2_prompts.py b/vision_agent/agent/easytool_v2_prompts.py new file mode 100644 index 00000000..8b3cbaa1 --- /dev/null +++ b/vision_agent/agent/easytool_v2_prompts.py @@ -0,0 +1,152 @@ +VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question, the tool usage for each of the tools used and the final answer the agent provided. 
You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used. + +Please note that: +1. You must ONLY output parsible JSON format. If the agents output was correct set "Finish" to true, else set "Finish" to false. An example output looks like: +{{"Finish": true, "Reflection": "The agent's answer was correct."}} +2. You must utilize the image with the visualized bounding boxes or masks and determine if the tools were used correctly or if the tools were used incorrectly or the wrong tools were used. +3. If the agent's answer was incorrect, you must diagnose the reason for failure and devise a new concise and concrete plan that aims to mitigate the same failure with the tools available. An example output looks like: + {{"Finish": false, "Reflection": "I can see from the visualized bounding boxes that the agent's answer was incorrect because the grounding_dino_ tool produced false positive predictions. The agent should use the following tools with the following parameters: + Step 1: Use 'grounding_dino_' with a 'prompt' of 'baby. bed' and a 'box_threshold' of 0.7 to reduce the false positives. + Step 2: Use 'box_iou_' with the baby bounding box and the bed bounding box to determine if the baby is on the bed or not."}} +4. If the task cannot be completed with the existing tools or by adjusting the parameters, set "Finish" to true. + +User's question: {question} + +Tools available: +{tools} + +Tasks and tools used: +{tool_results} + +Tool's used API documentation: +{tool_usage} + +Final answer: +{final_answer} + +Reflection: """ + +TASK_DECOMPOSE = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step. +This is the user's question: {question} +This is the tool list: +{tools} + +Please note that: +1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer. +2. If the given task is complex, You should decompose this user's complex question into simple subtasks which can only be executed easily by using one single tool in the tool list. +3. You should try to decompose the complex question into least number of subtasks. +4. If one subtask needs the results from another subtask, you should write clearly. For example: +{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}} +5. You must ONLY output in a parsible JSON format. An example output looks like: + +{{"Tasks": ["Task 1", "Task 2", ...]}} + +Output: """ + +TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step. +This is the user's question: {question} + +This is the tool list: +{tools} + +This is a reflection from a previous failed attempt: +{reflections} + +Please note that: +1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer. +2. If the given task is complex, You should decompose this user's complex question into simple subtasks which can only be executed easily by using one single tool in the tool list. +3. You should try to decompose the complex question into least number of subtasks. +4. If one subtask needs the results from another subtask, you should write clearly. 
For example: +{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}} +5. You must ONLY output in a parsible JSON format. An example output looks like: + +{{"Tasks": ["Task 1", "Task 2", ...]}} + +Output: """ + +CHOOSE_TOOL = """This is the user's question: {question} +These are the tools you can select to solve the question: +{tools} + +Please note that: +1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question. +2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question. +3. You should choose the tool whose return type is most relevant to the answer of the user's question. +4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like: + +Example 1: {{"ID": 1}} +Example 2: {{"ID": 2}} + +Output: """ + +CHOOSE_TOOL_DEPENDS = """This is the user's question: {question} +These are the tools you can select to solve the question: +{tools} + +This is a reflection from a previous failed attempt: +{reflections} + +Please note that: +1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question. +2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question. +3. You should choose the tool whose return type is most relevant to the answer of the user's question. +4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like: + +Example 1: {{"ID": 1}} +Example 2: {{"ID": 2}} + +Output: """ + +CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question. +Please note that: +1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs. +2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}} +3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs. +4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference. +5. If you need to use this API multiple times, please set "Parameters" to a list. +6. You must ONLY output in a parsible JSON format. Two example outputs look like: + +Example 1: {{"Parameters":{{"input": [1,2,3]}}}} +Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}} + +This is a reflection from a previous failed attempt: +{reflections} + +These are logs of previous questions and answers: +{previous_log} + +This is the current user's question: {question} +This is the API tool documentation: {tool_usage} +Output: """ + +ANSWER_GENERATE_DEPENDS = """You should answer the question based on the response output by the API tool. +Please note that: +1. You should try to organize the response into a natural language answer. +2. 
We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible. +3. If the API tool does not provide useful information in the response, please answer with your knowledge. +4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers. + +This is a reflection from a previous failed attempt: +{reflections} + +These are logs of previous questions and answers: +{previous_log} + +This is the user's question: {question} + +This is the response output by the API tool: +{call_results} + +We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible. +Output: """ + +ANSWER_SUMMARIZE_DEPENDS = """We break down a user's complex problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question +This is the user's question: {question} + +These are subtasks and their answers: +{answers} + +This is a reflection from a previous failed attempt: +{reflections} + +Final answer: """ diff --git a/vision_agent/agent/reflexion.py b/vision_agent/agent/reflexion.py index 61dded6d..d3b479b2 100644 --- a/vision_agent/agent/reflexion.py +++ b/vision_agent/agent/reflexion.py @@ -138,7 +138,7 @@ def __init__( def __call__( self, input: Union[str, List[Dict[str, str]]], - image: Optional[Union[str, Path]] = None, + media: Optional[Union[str, Path]] = None, ) -> str: """Invoke the vision agent. @@ -151,24 +151,24 @@ def __call__( """ if isinstance(input, str): input = [{"role": "user", "content": input}] - return self.chat(input, image) + return self.chat(input, media) def chat( - self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None + self, chat: List[Dict[str, str]], media: Optional[Union[str, Path]] = None ) -> str: if len(chat) == 0 or chat[0]["role"] != "user": raise ValueError( f"Invalid chat. Should start with user and alternate between user" f"and assistant and contain at least one entry {chat}" ) - if image is not None and isinstance(self.action_agent, LLM): + if media is not None and isinstance(self.action_agent, LLM): raise ValueError( "If image is provided, then action_agent must be an agent or LMM." 
) question = chat[0]["content"] if len(chat) == 1: - results = self._step(question, image=image) + results = self._step(question, image=media) self.last_scratchpad = results["scratchpad"] return results["action_arg"] @@ -183,10 +183,10 @@ def chat( self.last_scratchpad += "Answer is INCORRECT" chat_context = "The previous conversation was:\n" + chat_str reflections = self.reflect( - question, chat_context, self.last_scratchpad, image + question, chat_context, self.last_scratchpad, media ) _LOGGER.info(f" {reflections}") - results = self._step(question, reflections, image=image) + results = self._step(question, reflections, image=media) self.last_scratchpad = results["scratchpad"] return results["action_arg"] @@ -249,7 +249,7 @@ def prompt_agent( return format_step( self.action_agent( self._build_agent_prompt(question, reflections, scratchpad), - image=image, + media=image, ) ) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 2db933d9..f45ca7b4 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,778 +1,447 @@ +import copy import json import logging import sys -import tempfile from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Union, cast -from PIL import Image +from rich.console import Console +from rich.syntax import Syntax from tabulate import tabulate -from vision_agent.agent.agent import Agent -from vision_agent.agent.easytool_prompts import ( - ANSWER_GENERATE, - ANSWER_SUMMARIZE, - CHOOSE_PARAMETER, - CHOOSE_TOOL, - TASK_DECOMPOSE, - TASK_TOPOLOGY, -) +from vision_agent.agent import Agent from vision_agent.agent.vision_agent_prompts import ( - ANSWER_GENERATE_DEPENDS, - ANSWER_SUMMARIZE_DEPENDS, - CHOOSE_PARAMETER_DEPENDS, - CHOOSE_TOOL_DEPENDS, - TASK_DECOMPOSE_DEPENDS, - VISION_AGENT_REFLECTION, + CODE, + FEEDBACK, + FIX_BUG, + FULL_TASK, + PLAN, + REFLECT, + SIMPLE_TEST, + USER_REQ, ) from vision_agent.llm import LLM, OpenAILLM -from vision_agent.lmm import LMM, OpenAILMM -from vision_agent.tools import TOOLS -from vision_agent.utils.image_utils import ( - convert_to_b64, - overlay_bboxes, - overlay_heat_map, - overlay_masks, -) +from vision_agent.tools import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING +from vision_agent.utils import Execute +from vision_agent.utils.sim import Sim logging.basicConfig(stream=sys.stdout) _LOGGER = logging.getLogger(__name__) _MAX_TABULATE_COL_WIDTH = 80 +_EXECUTE = Execute(600) +_CONSOLE = Console() -def parse_json(s: str) -> Any: - s = ( - s.replace(": True", ": true") - .replace(": False", ": false") - .replace(":True", ": true") - .replace(":False", ": false") - .replace("```", "") - .strip() - ) - return json.loads(s) - - -def change_name(name: str) -> str: - change_list = ["from", "class", "return", "false", "true", "id", "and", "", "ID"] - if name in change_list: - name = "is_" + name.lower() - return name - - -def format_tools(tools: Dict[int, Any]) -> str: - # Format this way so it's clear what the ID's are - tool_str = "" - for key in tools: - tool_str += f"ID: {key} - {tools[key]}\n" - return tool_str - - -def format_tool_usage(tools: Dict[int, Any], tool_result: List[Dict]) -> str: - usage = [] - name_to_usage = {v["name"]: v["usage"] for v in tools.values()} - for tool_res in tool_result: - if "tool_name" in tool_res: - usage.append((tool_res["tool_name"], name_to_usage[tool_res["tool_name"]])) - - usage_str = "" - for tool_name, tool_usage 
in usage: - usage_str += f"{tool_name} - {tool_usage}\n" - return usage_str - - -def topological_sort(tasks: List[Dict]) -> List[Dict]: - in_degree = {task["id"]: 0 for task in tasks} - for task in tasks: - for dep in task["dep"]: - if dep in in_degree: - in_degree[task["id"]] += 1 - - queue = [task for task in tasks if in_degree[task["id"]] == 0] - sorted_order = [] - - while queue: - current = queue.pop(0) - sorted_order.append(current) - - for task in tasks: - if current["id"] in task["dep"]: - in_degree[task["id"]] -= 1 - if in_degree[task["id"]] == 0: - queue.append(task) - - if len(sorted_order) != len(tasks): - completed_ids = set([task["id"] for task in sorted_order]) - remaining_tasks = [task for task in tasks if task["id"] not in completed_ids] - sorted_order.extend(remaining_tasks) - return sorted_order - - -def task_decompose( - model: Union[LLM, LMM, Agent], - question: str, - tools: Dict[int, Any], - reflections: str, -) -> Optional[Dict]: - if reflections: - prompt = TASK_DECOMPOSE_DEPENDS.format( - question=question, tools=format_tools(tools), reflections=reflections - ) - else: - prompt = TASK_DECOMPOSE.format(question=question, tools=format_tools(tools)) - tries = 0 - str_result = "" - while True: - try: - str_result = model(prompt) - result = parse_json(str_result) - return result["Tasks"] # type: ignore - except Exception: - if tries > 10: - _LOGGER.error(f"Failed task_decompose on: {str_result}") - return None - tries += 1 - continue - - -def task_topology( - model: Union[LLM, LMM, Agent], question: str, task_list: List[Dict] -) -> List[Dict[str, Any]]: - prompt = TASK_TOPOLOGY.format(question=question, task_list=task_list) - tries = 0 - str_result = "" - while True: - try: - str_result = model(prompt) - result = parse_json(str_result) - for elt in result["Tasks"]: - if isinstance(elt["dep"], str): - elt["dep"] = [int(dep) for dep in elt["dep"].split(",")] - elif isinstance(elt["dep"], int): - elt["dep"] = [elt["dep"]] - elif isinstance(elt["dep"], list): - elt["dep"] = [int(dep) for dep in elt["dep"]] - return result["Tasks"] # type: ignore - except Exception: - if tries > 10: - _LOGGER.error(f"Failed task_topology on: {str_result}") - return task_list - tries += 1 - continue - - -def choose_tool( - model: Union[LLM, LMM, Agent], - question: str, - tools: Dict[int, Any], - reflections: str, -) -> Optional[int]: - if reflections: - prompt = CHOOSE_TOOL_DEPENDS.format( - question=question, tools=format_tools(tools), reflections=reflections - ) - else: - prompt = CHOOSE_TOOL.format(question=question, tools=format_tools(tools)) - tries = 0 - str_result = "" - while True: - try: - str_result = model(prompt) - result = parse_json(str_result) - return result["ID"] # type: ignore - except Exception: - if tries > 10: - _LOGGER.error(f"Failed choose_tool on: {str_result}") - return None - tries += 1 - continue - - -def choose_parameter( - model: Union[LLM, LMM, Agent], - question: str, - tool_usage: Dict, - previous_log: str, - reflections: str, -) -> Optional[Any]: - # TODO: should format tool_usage - if reflections: - prompt = CHOOSE_PARAMETER_DEPENDS.format( - question=question, - tool_usage=tool_usage, - previous_log=previous_log, - reflections=reflections, - ) - else: - prompt = CHOOSE_PARAMETER.format( - question=question, tool_usage=tool_usage, previous_log=previous_log - ) - tries = 0 - str_result = "" - while True: - try: - str_result = model(prompt) - result = parse_json(str_result) - return result["Parameters"] - except Exception: - if tries > 10: - 
_LOGGER.error(f"Failed choose_parameter on: {str_result}") - return None - tries += 1 - continue - - -def answer_generate( - model: Union[LLM, LMM, Agent], - question: str, - call_results: str, - previous_log: str, - reflections: str, -) -> str: - if reflections: - prompt = ANSWER_GENERATE_DEPENDS.format( - question=question, - call_results=call_results, - previous_log=previous_log, - reflections=reflections, - ) - else: - prompt = ANSWER_GENERATE.format( - question=question, call_results=call_results, previous_log=previous_log +def format_memory(memory: List[Dict[str, str]]) -> str: + return FEEDBACK.format( + feedback="\n".join( + [ + f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n" + for i, m in enumerate(memory) + ] ) - return model(prompt) + ) -def answer_summarize( - model: Union[LLM, LMM, Agent], question: str, answers: List[Dict], reflections: str -) -> str: - if reflections: - prompt = ANSWER_SUMMARIZE_DEPENDS.format( - question=question, answers=answers, reflections=reflections - ) +def extract_code(code: str) -> str: + if "\n```python" in code: + start = "\n```python" + elif "```python" in code: + start = "```python" else: - prompt = ANSWER_SUMMARIZE.format(question=question, answers=answers) - return model(prompt) + return code + + code = code[code.find(start) + len(start) :] + code = code[: code.find("```")] + if code.startswith("python\n"): + code = code[len("python\n") :] + return code -def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any: +def extract_json(json_str: str) -> Dict[str, Any]: try: - return tool()(**parameters) - except Exception as e: - _LOGGER.error(f"Failed function_call on: {e}") - # return error message so it can self-correct - return str(e) - - -def self_reflect( - reflect_model: Union[LLM, LMM], - question: str, - tools: Dict[int, Any], - tool_result: List[Dict], - final_answer: str, - images: Optional[Sequence[Union[str, Path]]] = None, -) -> str: - prompt = VISION_AGENT_REFLECTION.format( - question=question, - tools=format_tools({k: v["description"] for k, v in tools.items()}), - tool_usage=format_tool_usage(tools, tool_result), - tool_results=str(tool_result), - final_answer=final_answer, + json_dict = json.loads(json_str) + except json.JSONDecodeError: + if "```json" in json_str: + json_str = json_str[json_str.find("```json") + len("```json") :] + json_str = json_str[: json_str.find("```")] + elif "```" in json_str: + json_str = json_str[json_str.find("```") + len("```") :] + # get the last ``` not one from an intermediate string + json_str = json_str[: json_str.find("}```")] + json_dict = json.loads(json_str) + return json_dict # type: ignore + + +def write_plan( + chat: List[Dict[str, str]], + tool_desc: str, + working_memory: str, + model: LLM, +) -> List[Dict[str, str]]: + chat = copy.deepcopy(chat) + if chat[-1]["role"] != "user": + raise ValueError("Last chat message must be from the user.") + + user_request = chat[-1]["content"] + context = USER_REQ.format(user_request=user_request) + prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory) + chat[-1]["content"] = prompt + return extract_json(model.chat(chat))["plan"] # type: ignore + + +def reflect( + chat: List[Dict[str, str]], + plan: str, + code: str, + model: LLM, +) -> Dict[str, Union[str, bool]]: + chat = copy.deepcopy(chat) + if chat[-1]["role"] != "user": + raise ValueError("Last chat message must be from the user.") + + user_request = chat[-1]["content"] + context = 
USER_REQ.format(user_request=user_request) + prompt = REFLECT.format(context=context, plan=plan, code=code) + chat[-1]["content"] = prompt + return extract_json(model.chat(chat)) + + +def write_and_test_code( + task: str, + tool_info: str, + tool_utils: str, + working_memory: str, + coder: LLM, + tester: LLM, + debugger: LLM, + log_progress: Callable[[Dict[str, Any]], None], + verbosity: int = 0, + max_retries: int = 3, + input_media: Optional[Union[str, Path]] = None, +) -> Dict[str, Any]: + code = extract_code( + coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory)) + ) + test = extract_code( + tester( + SIMPLE_TEST.format( + docstring=tool_utils, + question=task, + code=code, + feedback=working_memory, + media=input_media, + ) + ) ) - if ( - issubclass(type(reflect_model), LMM) - and images is not None - and all([Path(image).suffix in [".jpg", ".jpeg", ".png"] for image in images]) - ): - return reflect_model(prompt, images=images) # type: ignore - return reflect_model(prompt) + success, result = _EXECUTE.run_isolation(f"{code}\n{test}") + if verbosity == 2: + _LOGGER.info("Initial code and tests:") + log_progress( + { + "log": "Code:", + "code": code, + } + ) + log_progress( + { + "log": "Test:", + "code": test, + } + ) + _CONSOLE.print( + Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True) + ) + log_progress( + { + "log": "Result:", + "result": result, + } + ) + _LOGGER.info(f"Initial result: {result}") + + count = 0 + new_working_memory = [] + while not success and count < max_retries: + fixed_code_and_test = extract_json( + debugger( + FIX_BUG.format( + code=code, tests=test, result=result, feedback=working_memory + ) + ) + ) + if fixed_code_and_test["code"].strip() != "": + code = extract_code(fixed_code_and_test["code"]) + if fixed_code_and_test["test"].strip() != "": + test = extract_code(fixed_code_and_test["test"]) + new_working_memory.append( + {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]} + ) -def parse_reflect(reflect: str) -> Any: - reflect = reflect.strip() - try: - return parse_json(reflect) - except Exception: - _LOGGER.error(f"Failed parse json reflection: {reflect}") - # LMMs have a hard time following directions, so make the criteria less strict - finish = ( - "finish" in reflect.lower() and len(reflect) < 100 - ) or "finish" in reflect.lower()[-10:] - return {"Finish": finish, "Reflection": reflect} - - -def _handle_extract_frames( - image_to_data: Dict[str, Dict], tool_result: Dict -) -> Dict[str, Dict]: - image_to_data = image_to_data.copy() - # handle extract_frames_ case, useful if it extracts frames but doesn't do - # any following processing - for video_file_output in tool_result["call_results"]: - # When the video tool is run with wrong parameters, exit the loop - if not isinstance(video_file_output, tuple) or len(video_file_output) < 2: - break - for frame, _ in video_file_output: - image = frame - if image not in image_to_data: - image_to_data[image] = { - "bboxes": [], - "masks": [], - "heat_map": [], - "labels": [], - "scores": [], + success, result = _EXECUTE.run_isolation(f"{code}\n{test}") + if verbosity == 2: + log_progress( + { + "log": f"Debug attempt {count + 1}, reflection:", + "result": fixed_code_and_test["reflections"], } - return image_to_data - - -def _handle_viz_tools( - image_to_data: Dict[str, Dict], tool_result: Dict -) -> Dict[str, Dict]: - image_to_data = image_to_data.copy() - - # handle grounding_sam_ and grounding_dino_ - parameters = 
tool_result["parameters"] - # parameters can either be a dictionary or list, parameters can also be malformed - # becaus the LLM builds them - if isinstance(parameters, dict): - if "image" not in parameters: - return image_to_data - parameters = [parameters] - elif isinstance(tool_result["parameters"], list): - if len(tool_result["parameters"]) < 1 or ( - "image" not in tool_result["parameters"][0] - ): - return image_to_data - - for param, call_result in zip(parameters, tool_result["call_results"]): - # Calls can fail, so we need to check if the call was successful. It can either: - # 1. return a str or some error that's not a dictionary - # 2. return a dictionary but not have the necessary keys - - if not isinstance(call_result, dict) or ( - "bboxes" not in call_result - and "mask" not in call_result - and "heat_map" not in call_result - ): - return image_to_data - - # if the call was successful, then we can add the image data - image = param["image"] - if image not in image_to_data: - image_to_data[image] = { - "bboxes": [], - "masks": [], - "heat_map": [], - "labels": [], - "scores": [], - } + ) + _LOGGER.info( + f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}" + ) + _CONSOLE.print( + Syntax( + f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True + ) + ) + log_progress( + { + "log": "Debug result:", + "result": result, + } + ) + _LOGGER.info(f"Debug result: {result}") + count += 1 - image_to_data[image]["bboxes"].extend(call_result.get("bboxes", [])) - image_to_data[image]["labels"].extend(call_result.get("labels", [])) - image_to_data[image]["scores"].extend(call_result.get("scores", [])) - image_to_data[image]["masks"].extend(call_result.get("masks", [])) - # only single heatmap is returned - if "heat_map" in call_result: - image_to_data[image]["heat_map"].append(call_result["heat_map"]) - if "mask_shape" in call_result: - image_to_data[image]["mask_shape"] = call_result["mask_shape"] - - return image_to_data - - -def sample_n_evenly_spaced(lst: Sequence, n: int) -> Sequence: - if n <= 0: - return [] - elif len(lst) == 0: - return [] - elif n == 1: - return [lst[0]] - elif n >= len(lst): - return lst - - spacing = (len(lst) - 1) / (n - 1) - return [lst[round(spacing * i)] for i in range(n)] - - -def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]: - image_to_data: Dict[str, Dict] = {} - for tool_result in all_tool_results: - # only handle bbox/mask tools or frame extraction - if tool_result["tool_name"] not in [ - "grounding_sam_", - "grounding_dino_", - "extract_frames_", - "dinov_", - "zero_shot_counting_", - "visual_prompt_counting_", - "ocr_", - ]: - continue - - if tool_result["tool_name"] == "extract_frames_": - image_to_data = _handle_extract_frames(image_to_data, tool_result) - else: - image_to_data = _handle_viz_tools(image_to_data, tool_result) - - visualized_images = [] - for image_str in image_to_data: - image_path = Path(image_str) - image_data = image_to_data[image_str] - if "_counting_" in tool_result["tool_name"]: - image = overlay_heat_map(image_path, image_data) - else: - image = overlay_masks(image_path, image_data) - image = overlay_bboxes(image, image_data) - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - image.save(f.name) - visualized_images.append(f.name) - return visualized_images + if verbosity >= 1: + _LOGGER.info("Final code and tests:") + _CONSOLE.print( + Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True) + ) + _LOGGER.info(f"Final 
Result: {result}")
+
+    return {
+        "code": code,
+        "test": test,
+        "success": success,
+        "test_result": result,
+        "working_memory": new_working_memory,
+    }
+
+
+def retrieve_tools(
+    plan: List[Dict[str, str]],
+    tool_recommender: Sim,
+    log_progress: Callable[[Dict[str, Any]], None],
+    verbosity: int = 0,
+) -> str:
+    tool_info = []
+    tool_desc = []
+    for task in plan:
+        tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+        tool_info.extend([e["doc"] for e in tools])
+        tool_desc.extend([e["desc"] for e in tools])
+    if verbosity == 2:
+        log_progress(
+            {
+                "log": "Retrieved tools:",
+                "tools": tool_desc,
+            }
+        )
+        _LOGGER.info(f"Tools: {tool_desc}")
+    tool_info_set = set(tool_info)
+    return "\n\n".join(tool_info_set)
 
 
 class VisionAgent(Agent):
-    r"""Vision Agent is an agent framework that utilizes tools as well as self
-    reflection to accomplish tasks, in particular vision tasks. Vision Agent is based
-    off of EasyTool https://arxiv.org/abs/2401.06201 and Reflexion
-    https://arxiv.org/abs/2303.11366 where it will attempt to complete a task and then
-    reflect on whether or not it was able to accomplish the task based off of the plan
-    and final results, if not it will redo the task with this newly added reflection.
+    """Vision Agent is an agentic framework that can output code based on a user
+    request. It can plan tasks, retrieve relevant tools, write code, write tests and
+    reflect on failed test cases to debug code. It is inspired by AgentCoder
+    https://arxiv.org/abs/2312.13010 and Data Interpreter
+    https://arxiv.org/abs/2402.18679
 
     Example
     -------
-    >>> from vision_agent.agent import VisionAgent
+    >>> from vision_agent import VisionAgent
     >>> agent = VisionAgent()
-    >>> resp = agent("If red tomatoes cost $5 each and yellow tomatoes cost $2.50 each, what is the total cost of all the tomatoes in the image?", image="tomatoes.jpg")
-    >>> print(resp)
-    "The total cost is $57.50."
+    >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
     """
 
     def __init__(
         self,
-        task_model: Optional[Union[LLM, LMM]] = None,
-        answer_model: Optional[Union[LLM, LMM]] = None,
-        reflect_model: Optional[Union[LLM, LMM]] = None,
-        max_retries: int = 2,
-        verbose: bool = False,
+        planner: Optional[LLM] = None,
+        coder: Optional[LLM] = None,
+        tester: Optional[LLM] = None,
+        debugger: Optional[LLM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
         report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
-    ):
-        """VisionAgent constructor.
+    ) -> None:
+        """Initialize the Vision Agent.
 
         Parameters:
-            task_model: the model to use for task decomposition.
-            answer_model: the model to use for reasoning and concluding the answer.
-            reflect_model: the model to use for self reflection.
-            max_retries: maximum number of retries to attempt to complete the task.
-            verbose: whether to print more logs.
-            report_progress_callback: a callback to report the progress of the agent. This is useful for streaming logs in a web application where multiple VisionAgent instances are running in parallel. This callback ensures that the progress are not mixed up.
+            planner (Optional[LLM]): The planner model to use. Defaults to OpenAILLM.
+            coder (Optional[LLM]): The coder model to use. Defaults to OpenAILLM.
+            tester (Optional[LLM]): The tester model to use. Defaults to OpenAILLM.
+            debugger (Optional[LLM]): The debugger model to use. Defaults to OpenAILLM.
+            tool_recommender (Optional[Sim]): The tool recommender model to use. 
+            verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
+                highest verbosity level which will output all intermediate debugging
+                code.
+            report_progress_callback: a callback to report the progress of the agent.
+                This is useful for streaming logs in a web application where multiple
+                VisionAgent instances are running in parallel. This callback ensures
+                that the progress is not mixed up.
         """
-        self.task_model = (
-            OpenAILLM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
-            if task_model is None
-            else task_model
+
+        self.planner = (
+            OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner
         )
-        self.answer_model = (
-            OpenAILLM(model_name="gpt-4-turbo", temperature=0.0)
-            if answer_model is None
-            else answer_model
+        self.coder = OpenAILLM(temperature=0.0) if coder is None else coder
+        self.tester = OpenAILLM(temperature=0.0) if tester is None else tester
+        self.debugger = (
+            OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger
         )
-        self.reflect_model = (
-            OpenAILMM(model_name="gpt-4-turbo", json_mode=True, temperature=0.0)
-            if reflect_model is None
-            else reflect_model
+
+        self.tool_recommender = (
+            Sim(TOOLS_DF, sim_key="desc")
+            if tool_recommender is None
+            else tool_recommender
         )
-        self.max_retries = max_retries
-        self.tools = TOOLS
+        self.verbosity = verbosity
+        self.max_retries = 2
         self.report_progress_callback = report_progress_callback
-        if verbose:
-            _LOGGER.setLevel(logging.INFO)
 
     def __call__(
         self,
         input: Union[List[Dict[str, str]], str],
-        image: Optional[Union[str, Path]] = None,
-        reference_data: Optional[Dict[str, str]] = None,
-        visualize_output: Optional[bool] = False,
-        self_reflection: Optional[bool] = True,
+        media: Optional[Union[str, Path]] = None,
     ) -> str:
-        """Invoke the vision agent.
+        """Chat with Vision Agent and return the code generated for the task.
 
         Parameters:
-            chat: A conversation in the format of
+            input (Union[List[Dict[str, str]], str]): A user request string or a conversation in the format of
                 [{"role": "user", "content": "describe your task here..."}].
-            image: The input image referenced in the chat parameter.
-            reference_data: A dictionary containing the reference image, mask or bounding
-                box in the format of:
-                {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]}
-                where the bounding box coordinates are normalized.
-            visualize_output: Whether to visualize the output.
-            self_reflection: boolean to enable and disable self reflection.
+            media (Optional[Union[str, Path]]): The media file to be used in the task.
 
         Returns:
-            The result of the vision agent in text.
+            str: The code output by the Vision Agent. 
""" + if isinstance(input, str): input = [{"role": "user", "content": input}] - return self.chat( - input, - image=image, - visualize_output=visualize_output, - reference_data=reference_data, - self_reflection=self_reflection, - ) - - def log_progress(self, data: Dict[str, Any]) -> None: - _LOGGER.info(data) - if self.report_progress_callback: - self.report_progress_callback(data) - - def _report_visualization_via_callback( - self, images: Sequence[Union[str, Path]] - ) -> None: - """This is intended for streaming the visualization images via the callback to the client side.""" - if self.report_progress_callback: - self.report_progress_callback({"log": ""}) - if images: - for img in images: - self.report_progress_callback( - {"log": f"base:64{convert_to_b64(img)}"} - ) - self.report_progress_callback({"log": ""}) + results = self.chat_with_workflow(input, media) + results.pop("working_memory") + return results["code"] # type: ignore def chat_with_workflow( self, chat: List[Dict[str, str]], - image: Optional[Union[str, Path]] = None, - reference_data: Optional[Dict[str, str]] = None, - visualize_output: Optional[bool] = False, - self_reflection: Optional[bool] = True, - ) -> Tuple[str, List[Dict]]: - """Chat with the vision agent and return the final answer and all tool results. + media: Optional[Union[str, Path]] = None, + self_reflection: bool = False, + ) -> Dict[str, Any]: + """Chat with Vision Agent and return intermediate information regarding the task. Parameters: - chat: A conversation in the format of + chat (List[Dict[str, str]]): A conversation in the format of [{"role": "user", "content": "describe your task here..."}]. - image: The input image referenced in the chat parameter. - reference_data: A dictionary containing the reference image, mask or bounding - box in the format of: - {"image": "image.jpg", "mask": "mask.jpg", "bbox": [0.1, 0.2, 0.1, 0.2]} - where the bounding box coordinates are normalized. - visualize_output: Whether to visualize the output. - self_reflection: boolean to enable and disable self reflection. + media (Optional[Union[str, Path]]): The media file to be used in the task. + self_reflection (bool): Whether to reflect on the task and debug the code. Returns: - A tuple where the first item is the final answer and the second item is a - list of all the tool results. The last item in the tool results also - contains the visualized output. + Dict[str, Any]: A dictionary containing the code, test, test result, plan, + and working memory of the agent. 
""" - if len(chat) == 0: - raise ValueError("Input cannot be empty.") - - question = chat[0]["content"] - if image: - question += f" Image name: {image}" - if reference_data: - question += ( - f" Reference image: {reference_data['image']}" - if "image" in reference_data - else "" - ) - question += ( - f" Reference mask: {reference_data['mask']}" - if "mask" in reference_data - else "" - ) - question += ( - f" Reference bbox: {reference_data['bbox']}" - if "bbox" in reference_data - else "" - ) - - reflections = "" - final_answer = "" - all_tool_results: List[Dict] = [] - for _ in range(self.max_retries): - task_list = self.create_tasks( - self.task_model, question, self.tools, reflections + if len(chat) == 0: + raise ValueError("Chat cannot be empty.") + + if media is not None: + for chat_i in chat: + if chat_i["role"] == "user": + chat_i["content"] += f" Image name {media}" + + code = "" + test = "" + working_memory: List[Dict[str, str]] = [] + results = {"code": "", "test": "", "plan": []} + plan = [] + success = False + retries = 0 + + while not success and retries < self.max_retries: + plan_i = write_plan( + chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner ) - - task_depend = {"Original Question": question} - previous_log = "" - answers = [] - for task in task_list: - task_depend[task["id"]] = {"task": task["task"], "answer": "", "call_result": ""} # type: ignore - all_tool_results = [] - - for task in task_list: - task_str = task["task"] - previous_log = str(task_depend) - tool_results, call_results = self.retrieval( - self.task_model, - task_str, - self.tools, - previous_log, - reflections, - ) - answer = answer_generate( - self.answer_model, task_str, call_results, previous_log, reflections + plan_i_str = "\n-".join([e["instructions"] for e in plan_i]) + if self.verbosity >= 1: + self.log_progress( + { + "log": "Going to run the following plan(s) in sequence:\n", + "plan": plan_i, + } ) - tool_results["answer"] = answer - all_tool_results.append(tool_results) + _LOGGER.info( + f""" +{tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" + ) - self.log_progress({"log": f"\tCall Result: {call_results}"}) - self.log_progress({"log": f"\tAnswer: {answer}"}) - answers.append({"task": task_str, "answer": answer}) - task_depend[task["id"]]["answer"] = answer # type: ignore - task_depend[task["id"]]["call_result"] = call_results # type: ignore - final_answer = answer_summarize( - self.answer_model, question, answers, reflections + tool_info = retrieve_tools( + plan_i, + self.tool_recommender, + self.log_progress, + self.verbosity, ) - visualized_output = visualize_result(all_tool_results) - all_tool_results.append({"visualized_output": visualized_output}) - if len(visualized_output) > 0: - reflection_images = sample_n_evenly_spaced(visualized_output, 3) - elif image is not None: - reflection_images = [image] - else: - reflection_images = None + results = write_and_test_code( + FULL_TASK.format(user_request=chat[0]["content"], subtasks=plan_i_str), + tool_info, + UTILITIES_DOCSTRING, + format_memory(working_memory), + self.coder, + self.tester, + self.debugger, + self.log_progress, + verbosity=self.verbosity, + input_media=media, + ) + success = cast(bool, results["success"]) + code = cast(str, results["code"]) + test = cast(str, results["test"]) + working_memory.extend(results["working_memory"]) # type: ignore + plan.append({"code": code, "test": test, "plan": plan_i}) if self_reflection: - reflection = 
self_reflect( - self.reflect_model, - question, - self.tools, - all_tool_results, - final_answer, - reflection_images, - ) - self.log_progress({"log": f"Reflection: {reflection}"}) - parsed_reflection = parse_reflect(reflection) - if parsed_reflection["Finish"]: - break - else: - reflections += "\n" + parsed_reflection["Reflection"] - else: - self.log_progress( - {"log": "Self Reflection skipped based on user request."} + reflection = reflect( + chat, + FULL_TASK.format( + user_request=chat[0]["content"], subtasks=plan_i_str + ), + code, + self.planner, ) - break - # '' is a symbol to indicate the end of the chat, which is useful for streaming logs. - self.log_progress( - { - "log": f"The Vision Agent has concluded this chat. {final_answer}" - } - ) - - if visualize_output: - viz_images: Sequence[Union[str, Path]] = all_tool_results[-1][ - "visualized_output" - ] - self._report_visualization_via_callback(viz_images) - for img in viz_images: - Image.open(img).show() - - return final_answer, all_tool_results - - def chat( - self, - chat: List[Dict[str, str]], - image: Optional[Union[str, Path]] = None, - reference_data: Optional[Dict[str, str]] = None, - visualize_output: Optional[bool] = False, - self_reflection: Optional[bool] = True, - ) -> str: - answer, _ = self.chat_with_workflow( - chat, - image=image, - visualize_output=visualize_output, - reference_data=reference_data, - self_reflection=self_reflection, - ) - return answer - - def retrieval( - self, - model: Union[LLM, LMM, Agent], - question: str, - tools: Dict[int, Any], - previous_log: str, - reflections: str, - ) -> Tuple[Dict, str]: - tool_id = choose_tool( - model, - question, - {k: v["description"] for k, v in tools.items()}, - reflections, - ) - if tool_id is None: - return {}, "" - - tool_instructions = tools[tool_id] - tool_usage = tool_instructions["usage"] - tool_name = tool_instructions["name"] + if self.verbosity > 0: + self.log_progress( + { + "log": "Reflection:", + "reflection": reflection, + } + ) + _LOGGER.info(f"Reflection: {reflection}") + feedback = cast(str, reflection["feedback"]) + success = cast(bool, reflection["success"]) + working_memory.append({"code": f"{code}\n{test}", "feedback": feedback}) - parameters = choose_parameter( - model, question, tool_usage, previous_log, reflections - ) - if parameters is None: - return {}, "" - tool_results = { - "task": question, - "tool_name": tool_name, - "parameters": parameters, - } + retries += 1 self.log_progress( { - "log": f"""Going to run the following tool(s) in sequence: -{tabulate(tabular_data=[tool_results], headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" + "log": f"Vision Agent has concluded this chat.\nSuccess: {success}", + "finished": True, } ) - def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: - call_results: List[Any] = [] - if isinstance(result["parameters"], Dict): - call_results.append( - function_call(tools[tool_id]["class"], result["parameters"]) - ) - elif isinstance(result["parameters"], List): - for parameters in result["parameters"]: - call_results.append( - function_call(tools[tool_id]["class"], parameters) - ) - return call_results - - call_results = parse_tool_results(tool_results) - tool_results["call_results"] = call_results - - call_results_str = str(call_results) - return tool_results, call_results_str + return { + "code": code, + "test": test, + "test_result": results["test_result"], + "plan": plan, + "working_memory": working_memory, + } - def create_tasks( - self, - task_model: 
Union[LLM, LMM], - question: str, - tools: Dict[int, Any], - reflections: str, - ) -> List[Dict]: - tasks = task_decompose( - task_model, - question, - {k: v["description"] for k, v in tools.items()}, - reflections, - ) - if tasks is not None: - task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)] - task_list = task_topology(task_model, question, task_list) - try: - task_list = topological_sort(task_list) - except Exception: - _LOGGER.error(f"Failed topological_sort on: {task_list}") - else: - task_list = [] - self.log_progress( - { - "log": "Planned tasks:", - "plan": task_list, - } - ) - return task_list + def log_progress(self, data: Dict[str, Any]) -> None: + if self.report_progress_callback is not None: + self.report_progress_callback(data) + pass diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 8b3cbaa1..6041bfc3 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,152 +1,234 @@ -VISION_AGENT_REFLECTION = """You are an advanced reasoning agent that can improve based on self-refection. You will be given a previous reasoning trial in which you were given the user's question, the available tools that the agent has, the decomposed tasks and tools that the agent used to answer the question, the tool usage for each of the tools used and the final answer the agent provided. You may also receive an image with the visualized bounding boxes or masks with their associated labels and scores from the tools used. +USER_REQ = """ +## User Request +{user_request} +""" -Please note that: -1. You must ONLY output parsible JSON format. If the agents output was correct set "Finish" to true, else set "Finish" to false. An example output looks like: -{{"Finish": true, "Reflection": "The agent's answer was correct."}} -2. You must utilize the image with the visualized bounding boxes or masks and determine if the tools were used correctly or if the tools were used incorrectly or the wrong tools were used. -3. If the agent's answer was incorrect, you must diagnose the reason for failure and devise a new concise and concrete plan that aims to mitigate the same failure with the tools available. An example output looks like: - {{"Finish": false, "Reflection": "I can see from the visualized bounding boxes that the agent's answer was incorrect because the grounding_dino_ tool produced false positive predictions. The agent should use the following tools with the following parameters: - Step 1: Use 'grounding_dino_' with a 'prompt' of 'baby. bed' and a 'box_threshold' of 0.7 to reduce the false positives. - Step 2: Use 'box_iou_' with the baby bounding box and the bed bounding box to determine if the baby is on the bed or not."}} -4. If the task cannot be completed with the existing tools or by adjusting the parameters, set "Finish" to true. +FULL_TASK = """ +## User Request +{user_request} -User's question: {question} +## Subtasks +{subtasks} +""" -Tools available: -{tools} - -Tasks and tools used: -{tool_results} - -Tool's used API documentation: -{tool_usage} - -Final answer: -{final_answer} - -Reflection: """ - -TASK_DECOMPOSE = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step. -This is the user's question: {question} -This is the tool list: -{tools} - -Please note that: -1. 
If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer. -2. If the given task is complex, You should decompose this user's complex question into simple subtasks which can only be executed easily by using one single tool in the tool list. -3. You should try to decompose the complex question into least number of subtasks. -4. If one subtask needs the results from another subtask, you should write clearly. For example: -{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}} -5. You must ONLY output in a parsible JSON format. An example output looks like: - -{{"Tasks": ["Task 1", "Task 2", ...]}} - -Output: """ - -TASK_DECOMPOSE_DEPENDS = """You need to decompose a user's complex question into one or more simple subtasks and let the model execute it step by step. -This is the user's question: {question} - -This is the tool list: -{tools} - -This is a reflection from a previous failed attempt: -{reflections} - -Please note that: -1. If the given task is simple and the answer can be provided by executing one tool, you should only use that tool to provide the answer. -2. If the given task is complex, You should decompose this user's complex question into simple subtasks which can only be executed easily by using one single tool in the tool list. -3. You should try to decompose the complex question into least number of subtasks. -4. If one subtask needs the results from another subtask, you should write clearly. For example: -{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}} -5. You must ONLY output in a parsible JSON format. An example output looks like: - -{{"Tasks": ["Task 1", "Task 2", ...]}} - -Output: """ - -CHOOSE_TOOL = """This is the user's question: {question} -These are the tools you can select to solve the question: -{tools} - -Please note that: -1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question. -2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question. -3. You should choose the tool whose return type is most relevant to the answer of the user's question. -4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like: - -Example 1: {{"ID": 1}} -Example 2: {{"ID": 2}} - -Output: """ - -CHOOSE_TOOL_DEPENDS = """This is the user's question: {question} -These are the tools you can select to solve the question: -{tools} - -This is a reflection from a previous failed attempt: -{reflections} - -Please note that: -1. You should only choose one tool from the Tool List to solve this question and it should have maximum chance of solving the question. -2. You should only choose the tool whose parameters are most relevant to the user's question and are available as part of the question. -3. You should choose the tool whose return type is most relevant to the answer of the user's question. -4. You must ONLY output the ID of the tool you chose in a parsible JSON format. Two example outputs look like: - -Example 1: {{"ID": 1}} -Example 2: {{"ID": 2}} - -Output: """ - -CHOOSE_PARAMETER_DEPENDS = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question. 
-Please note that: -1. The Example in the API tool documentation can help you better understand the use of the API. Pay attention to the examples which show how to parse the question and extract tool parameters such as prompts and visual inputs. -2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}} -3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs. -4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference. -5. If you need to use this API multiple times, please set "Parameters" to a list. -6. You must ONLY output in a parsible JSON format. Two example outputs look like: - -Example 1: {{"Parameters":{{"input": [1,2,3]}}}} -Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}} - -This is a reflection from a previous failed attempt: -{reflections} - -These are logs of previous questions and answers: -{previous_log} - -This is the current user's question: {question} -This is the API tool documentation: {tool_usage} -Output: """ - -ANSWER_GENERATE_DEPENDS = """You should answer the question based on the response output by the API tool. -Please note that: -1. You should try to organize the response into a natural language answer. -2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible. -3. If the API tool does not provide useful information in the response, please answer with your knowledge. -4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers. - -This is a reflection from a previous failed attempt: -{reflections} - -These are logs of previous questions and answers: -{previous_log} - -This is the user's question: {question} - -This is the response output by the API tool: -{call_results} - -We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible. -Output: """ - -ANSWER_SUMMARIZE_DEPENDS = """We break down a user's complex problems into simple subtasks and provide answers to each simple subtask. You need to organize these answers to each subtask and form a self-consistent final answer to the user's question -This is the user's question: {question} - -These are subtasks and their answers: -{answers} - -This is a reflection from a previous failed attempt: -{reflections} - -Final answer: """ +FEEDBACK = """ +## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again. + +{feedback} +""" + + +PLAN = """ +**Context** +{context} + +**Tools Available**: +{tool_desc} + +**Previous Feedback**: +{feedback} + +**Instructions**: +Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. 
Output a list of JSON objects in the following format:
+
+```json
+{{
+    "plan":
+        [
+            {{
+                "instructions": str # what you should do in this task, one short phrase or sentence
+            }}
+        ]
+}}
+```
+"""
+
+CODE = """
+**Role**: You are a software programmer.
+
+**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code; a test will be run after the code is submitted.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+{docstring}
+
+**Input Code Snippet**:
+```python
+# Your code here
+```
+
+**User Instructions**:
+{question}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task.
+2. **Algorithm/Method Selection**: Decide on the most efficient way.
+3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
+4. **Code Generation**: Translate your pseudocode into executable Python code.
+5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools import *`. Use a debug flag in the function parameters to toggle logging on and off.
+"""
+
+TEST = """
+**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass basic and edge-case scenarios to ensure the code's robustness and reliability where possible.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions. Test only the code provided by the user.
+
+{docstring}
+
+**User Instructions**:
+{question}
+
+**Input Code Snippet**:
+```python
+### Please decide how you would want to generate test cases, based on the incomplete code or the completed version.
+{code}
+```
+
+**Instructions**:
+1. Verify the fundamental functionality under normal conditions.
+2. Ensure each test case is well-documented with comments explaining the scenario it covers.
+3. DO NOT use any files that are not provided by the user's instructions; your test must be run and will crash if it tries to load a non-existent file.
+4. DO NOT mock any functions; you must test their functionality as is.
+
+You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example:
+```python
+# You can run assertions to ensure the function is working as expected
+assert function(input) == expected_output, "Test case description"
+
+# You can simply call the function to ensure it runs
+function(input)
+
+# Or you can visualize the output
+output = function(input)
+visualize(output)
+```
+
+**Examples**:
+## Prompt 1:
+```python
+def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]:
+    \""" Detects cats and dogs in an image. Returns a dictionary with
+        {{
+            "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...] 
+        }}
+    \"""
+```
+
+## Completion 1:
+```python
+# We can test to ensure the output has the correct structure but we cannot test the
+# content of the output without knowing the image. We can test on "image.jpg" because
+# it is provided by the user so we know it exists.
+output = detect_cats_and_dogs("image.jpg")
+assert "cats" in output, "The output should contain 'cats'"
+assert "dogs" in output, "The output should contain 'dogs'"
+```
+
+## Prompt 2:
+```python
+def find_text(image_path: str, text: str) -> str:
+    \""" Finds the text in the image and returns the text. \"""
+```
+
+## Completion 2:
+```python
+# Because we do not know ahead of time what text is in the image, we can only run the
+# code and print the results. We can test on "image.jpg" because it is provided by the
+# user so we know it exists.
+found_text = find_text("image.jpg", "Hello World")
+print(found_text)
+```
+"""
+
+
+SIMPLE_TEST = """
+**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user.
+
+{docstring}
+
+**User Instructions**:
+{question}
+
+**Input Code Snippet**:
+```python
+### Please decide how you would want to generate test cases, based on the incomplete code or the completed version.
+{code}
+```
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. Verify the fundamental functionality under normal conditions.
+2. Ensure each test case is well-documented with comments explaining the scenario it covers.
+3. Your test case MUST run only on the given image, which is {media}.
+4. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions.
+5. DO NOT mock any functions; you must test their functionality as is.
+6. DO NOT assert the output value; run the code, verify it runs without any errors, and assert only the output format or data structure.
+7. DO NOT import the testing function as it will be available in the testing environment.
+8. Print the output of the function that is being tested.
+"""
+
+
+FIX_BUG = """
+**Role**: As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting, so you can run !pip install to install missing packages.
+
+**Instructions**:
+Please re-complete the code to fix the error message. Here is the previous version:
+```python
+{code}
+```
+
+When we run this test code:
+```python
+{tests}
+```
+
+It raises this error:
+```python
+{result}
+```
+
+This is previous feedback provided on the code:
+{feedback}
+
+Please fix the bug by following the error information and return a JSON object with the following format:
+{{
+    "reflections": str # any thoughts you have about the bug and how you fixed it
+    "code": str # the fixed code if any, else an empty string
+    "test": str # the fixed test code if any, else an empty string
+}}
+"""
+
+
+REFLECT = """
+**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only with whether the code meets the user request, not with whether the code is good or bad. 
+ +**Context**: +{context} + +**Plan**: +{plan} + +**Code**: +{code} + +**Instructions**: +1. **Understand the User Request**: Read the user request and understand what the user is asking for. +2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request. +3. **Review the Code**: Check the code to see if it solves the user request. +4. DO NOT add any reflections for test cases, these are taken care of. + +Respond in JSON format with the following structure: +{{ + "feedback": str # the feedback you would give to the coder and tester + "success": bool # whether the code and tests meet the user request +}} +""" diff --git a/vision_agent/agent/vision_agent_v3.py b/vision_agent/agent/vision_agent_v3.py deleted file mode 100644 index d9fb8821..00000000 --- a/vision_agent/agent/vision_agent_v3.py +++ /dev/null @@ -1,394 +0,0 @@ -import copy -import json -import logging -import sys -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union, cast, no_type_check - -from rich.console import Console -from rich.syntax import Syntax -from tabulate import tabulate - -from vision_agent.agent import Agent -from vision_agent.agent.vision_agent_v3_prompts import ( - CODE, - FEEDBACK, - FIX_BUG, - FULL_TASK, - PLAN, - REFLECT, - SIMPLE_TEST, - USER_REQ, -) -from vision_agent.llm import LLM, OpenAILLM -from vision_agent.tools.tools_v2 import TOOL_DESCRIPTIONS, TOOLS_DF, UTILITIES_DOCSTRING -from vision_agent.utils import Execute -from vision_agent.utils.sim import Sim - -logging.basicConfig(stream=sys.stdout) -_LOGGER = logging.getLogger(__name__) -_MAX_TABULATE_COL_WIDTH = 80 -_EXECUTE = Execute(600) -_CONSOLE = Console() - - -def format_memory(memory: List[Dict[str, str]]) -> str: - return FEEDBACK.format( - feedback="\n".join( - [ - f"### Feedback {i}:\nCode: ```python\n{m['code']}\n```\nFeedback: {m['feedback']}\n" - for i, m in enumerate(memory) - ] - ) - ) - - -def extract_code(code: str) -> str: - if "\n```python" in code: - start = "\n```python" - elif "```python" in code: - start = "```python" - else: - return code - - code = code[code.find(start) + len(start) :] - code = code[: code.find("```")] - if code.startswith("python\n"): - code = code[len("python\n") :] - return code - - -def extract_json(json_str: str) -> Dict[str, Any]: - try: - json_dict = json.loads(json_str) - except json.JSONDecodeError: - if "```json" in json_str: - json_str = json_str[json_str.find("```json") + len("```json") :] - json_str = json_str[: json_str.find("```")] - elif "```" in json_str: - json_str = json_str[json_str.find("```") + len("```") :] - # get the last ``` not one from an intermediate string - json_str = json_str[: json_str.find("}```")] - json_dict = json.loads(json_str) - return json_dict # type: ignore - - -def write_plan( - chat: List[Dict[str, str]], - tool_desc: str, - working_memory: str, - model: LLM, -) -> List[Dict[str, str]]: - chat = copy.deepcopy(chat) - if chat[-1]["role"] != "user": - raise ValueError("Last chat message must be from the user.") - - user_request = chat[-1]["content"] - context = USER_REQ.format(user_request=user_request) - prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory) - chat[-1]["content"] = prompt - return extract_json(model.chat(chat))["plan"] # type: ignore - - -def reflect( - chat: List[Dict[str, str]], - plan: str, - code: str, - model: LLM, -) -> Dict[str, Union[str, bool]]: - chat = copy.deepcopy(chat) - if chat[-1]["role"] != "user": - raise ValueError("Last chat 
message must be from the user.") - - user_request = chat[-1]["content"] - context = USER_REQ.format(user_request=user_request) - prompt = REFLECT.format(context=context, plan=plan, code=code) - chat[-1]["content"] = prompt - return extract_json(model.chat(chat)) - - -def write_and_test_code( - task: str, - tool_info: str, - tool_utils: str, - working_memory: str, - coder: LLM, - tester: LLM, - debugger: LLM, - log_progress: Callable[[Dict[str, Any]], None], - verbosity: int = 0, - max_retries: int = 3, - input_media: Optional[Union[str, Path]] = None, -) -> Dict[str, Any]: - code = extract_code( - coder(CODE.format(docstring=tool_info, question=task, feedback=working_memory)) - ) - test = extract_code( - tester( - SIMPLE_TEST.format( - docstring=tool_utils, - question=task, - code=code, - feedback=working_memory, - media=input_media, - ) - ) - ) - - success, result = _EXECUTE.run_isolation(f"{code}\n{test}") - if verbosity == 2: - _LOGGER.info("Initial code and tests:") - log_progress( - { - "log": "Code:", - "code": code, - } - ) - log_progress( - { - "log": "Test:", - "code": test, - } - ) - _CONSOLE.print( - Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True) - ) - log_progress( - { - "log": "Result:", - "result": result, - } - ) - _LOGGER.info(f"Initial result: {result}") - - count = 0 - new_working_memory = [] - while not success and count < max_retries: - fixed_code_and_test = extract_json( - debugger( - FIX_BUG.format( - code=code, tests=test, result=result, feedback=working_memory - ) - ) - ) - if fixed_code_and_test["code"].strip() != "": - code = extract_code(fixed_code_and_test["code"]) - if fixed_code_and_test["test"].strip() != "": - test = extract_code(fixed_code_and_test["test"]) - new_working_memory.append( - {"code": f"{code}\n{test}", "feedback": fixed_code_and_test["reflections"]} - ) - - success, result = _EXECUTE.run_isolation(f"{code}\n{test}") - if verbosity == 2: - log_progress( - { - "log": f"Debug attempt {count + 1}, reflection:", - "result": fixed_code_and_test["reflections"], - } - ) - _LOGGER.info( - f"Debug attempt {count + 1}, reflection: {fixed_code_and_test['reflections']}" - ) - _CONSOLE.print( - Syntax( - f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True - ) - ) - log_progress( - { - "log": "Debug result:", - "result": result, - } - ) - _LOGGER.info(f"Debug result: {result}") - count += 1 - - if verbosity >= 1: - _LOGGER.info("Final code and tests:") - _CONSOLE.print( - Syntax(f"{code}\n{test}", "python", theme="gruvbox-dark", line_numbers=True) - ) - _LOGGER.info(f"Final Result: {result}") - - return { - "code": code, - "test": test, - "success": success, - "test_result": result, - "working_memory": new_working_memory, - } - - -def retrieve_tools( - plan: List[Dict[str, str]], - tool_recommender: Sim, - log_progress: Callable[[Dict[str, Any]], None], - verbosity: int = 0, -) -> str: - tool_info = [] - tool_desc = [] - for task in plan: - tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3) - tool_info.extend([e["doc"] for e in tools]) - tool_desc.extend([e["desc"] for e in tools]) - if verbosity == 2: - log_progress( - { - "log": "Retrieved tools:", - "tools": tool_desc, - } - ) - _LOGGER.info(f"Tools: {tool_desc}") - tool_info_set = set(tool_info) - return "\n\n".join(tool_info_set) - - -class VisionAgentV3(Agent): - def __init__( - self, - timeout: int = 600, - planner: Optional[LLM] = None, - coder: Optional[LLM] = None, - tester: Optional[LLM] = None, - debugger: Optional[LLM] = None, - 
tool_recommender: Optional[Sim] = None, - verbosity: int = 0, - report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, - ) -> None: - self.planner = ( - OpenAILLM(temperature=0.0, json_mode=True) if planner is None else planner - ) - self.coder = OpenAILLM(temperature=0.0) if coder is None else coder - self.tester = OpenAILLM(temperature=0.0) if tester is None else tester - self.debugger = ( - OpenAILLM(temperature=0.0, json_mode=True) if debugger is None else debugger - ) - - self.tool_recommender = ( - Sim(TOOLS_DF, sim_key="desc") - if tool_recommender is None - else tool_recommender - ) - self.verbosity = verbosity - self.max_retries = 2 - self.report_progress_callback = report_progress_callback - - @no_type_check - def __call__( - self, - input: Union[List[Dict[str, str]], str], - image: Optional[Union[str, Path]] = None, - ) -> Dict[str, Any]: - if isinstance(input, str): - input = [{"role": "user", "content": input}] - results = self.chat_with_workflow(input, image) - results.pop("working_memory") - return results - - def chat_with_workflow( - self, - chat: List[Dict[str, str]], - image: Optional[Union[str, Path]] = None, - self_reflection: bool = False, - ) -> Dict[str, Any]: - if len(chat) == 0: - raise ValueError("Chat cannot be empty.") - - if image is not None: - for chat_i in chat: - if chat_i["role"] == "user": - chat_i["content"] += f" Image name {image}" - - code = "" - test = "" - working_memory: List[Dict[str, str]] = [] - results = {"code": "", "test": "", "plan": []} - plan = [] - success = False - retries = 0 - - while not success and retries < self.max_retries: - plan_i = write_plan( - chat, TOOL_DESCRIPTIONS, format_memory(working_memory), self.planner - ) - plan_i_str = "\n-".join([e["instructions"] for e in plan_i]) - if self.verbosity >= 1: - self.log_progress( - { - "log": "Going to run the following plan(s) in sequence:\n", - "plan": plan_i, - } - ) - - _LOGGER.info( - f""" -{tabulate(tabular_data=plan_i, headers="keys", tablefmt="mixed_grid", maxcolwidths=_MAX_TABULATE_COL_WIDTH)}""" - ) - - tool_info = retrieve_tools( - plan_i, - self.tool_recommender, - self.log_progress, - self.verbosity, - ) - results = write_and_test_code( - FULL_TASK.format(user_request=chat[0]["content"], subtasks=plan_i_str), - tool_info, - UTILITIES_DOCSTRING, - format_memory(working_memory), - self.coder, - self.tester, - self.debugger, - self.log_progress, - verbosity=self.verbosity, - input_media=image, - ) - success = cast(bool, results["success"]) - code = cast(str, results["code"]) - test = cast(str, results["test"]) - working_memory.extend(results["working_memory"]) # type: ignore - plan.append({"code": code, "test": test, "plan": plan_i}) - - if self_reflection: - reflection = reflect( - chat, - FULL_TASK.format( - user_request=chat[0]["content"], subtasks=plan_i_str - ), - code, - self.planner, - ) - if self.verbosity > 0: - self.log_progress( - { - "log": "Reflection:", - "reflection": reflection, - } - ) - _LOGGER.info(f"Reflection: {reflection}") - feedback = cast(str, reflection["feedback"]) - success = cast(bool, reflection["success"]) - working_memory.append({"code": f"{code}\n{test}", "feedback": feedback}) - - retries += 1 - - self.log_progress( - { - "log": f"The Vision Agent V3 has concluded this chat.\nSuccess: {success}", - "finished": True, - } - ) - - return { - "code": code, - "test": test, - "test_result": results["test_result"], - "plan": plan, - "working_memory": working_memory, - } - - def log_progress(self, data: Dict[str, Any]) -> 
None: - if self.report_progress_callback is not None: - self.report_progress_callback(data) - pass diff --git a/vision_agent/agent/vision_agent_v3_prompts.py b/vision_agent/agent/vision_agent_v3_prompts.py deleted file mode 100644 index d1e6077b..00000000 --- a/vision_agent/agent/vision_agent_v3_prompts.py +++ /dev/null @@ -1,234 +0,0 @@ -USER_REQ = """ -## User Request -{user_request} -""" - -FULL_TASK = """ -## User Request -{user_request} - -## Subtasks -{subtasks} -""" - -FEEDBACK = """ -## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again. - -{feedback} -""" - - -PLAN = """ -**Context** -{context} - -**Tools Available**: -{tool_desc} - -**Previous Feedback**: -{feedback} - -**Instructions**: -Based on the context and tools you have available, write a plan of subtasks to achieve the user request utilizing given tools when necessary. Output a list of jsons in the following format: - -```json -{{ - "plan": - [ - {{ - "instructions": str # what you should do in this task, one short phrase or sentence - }} - ] -}} -``` -""" - -CODE = """ -**Role**: You are a software programmer. - -**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted. - -**Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. - -{docstring} - -**Input Code Snippet**: -```python -# Your code here -``` - -**User Instructions**: -{question} - -**Previous Feedback**: -{feedback} - -**Instructions**: -1. **Understand and Clarify**: Make sure you understand the task. -2. **Algorithm/Method Selection**: Decide on the most efficient way. -3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. -4. **Code Generation**: Translate your pseudocode into executable Python code. -5. **Logging**: Log the output of the custom functions that were provided to you from `from vision_agent.tools.tools_v2 import *`. Use a debug flag in the function parameters to toggle logging on and off. -""" - -TEST = """ -**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible. - -**Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions. Test only the code provided by the user. - -{docstring} - -**User Instructions**: -{question} - -**Input Code Snippet**: -```python -### Please decided how would you want to generate test cases. Based on incomplete code or completed version. -{code} -``` - -**Instructions**: -1. Verify the fundamental functionality under normal conditions. -2. Ensure each test case is well-documented with comments explaining the scenario it covers. -3. 
DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file. -4. DO NOT mock any functions, you must test their functionality as is. - -You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example: -```python -# You can run assertions to ensure the function is working as expected -assert function(input) == expected_output, "Test case description" - -# You can simply call the function to ensure it runs -function(input) - -# Or you can visualize the output -output = function(input) -visualize(output) -``` - -**Examples**: -## Prompt 1: -```python -def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]: - \""" Detects cats and dogs in an image. Returns a dictionary with - {{ - "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...] - }} - \""" -``` - -## Completion 1: -```python -# We can test to ensure the output has the correct structure but we cannot test the -# content of the output without knowing the image. We can test on "image.jpg" because -# it is provided by the user so we know it exists. -output = detect_cats_and_dogs("image.jpg") -assert "cats" in output, "The output should contain 'cats' -assert "dogs" in output, "The output should contain 'dogs' -``` - -## Prompt 2: -```python -def find_text(image_path: str, text: str) -> str: - \""" Finds the text in the image and returns the text. \""" - -## Completion 2: -```python -# Because we do not know ahead of time what text is in the image, we can only run the -# code and print the results. We can test on "image.jpg" because it is provided by the -# user so we know it exists. -found_text = find_text("image.jpg", "Hello World") -print(found_text) -``` -""" - - -SIMPLE_TEST = """ -**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions. - -**Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools.tools_v2 import *`. You do not need to test these functions, only the code provided by the user. - -{docstring} - -**User Instructions**: -{question} - -**Input Code Snippet**: -```python -### Please decide how would you want to generate test cases. Based on incomplete code or completed version. -{code} -``` - -**Previous Feedback**: -{feedback} - -**Instructions**: -1. Verify the fundamental functionality under normal conditions. -2. Ensure each test case is well-documented with comments explaining the scenario it covers. -3. Your test case MUST run only on the given image which is {media} -4. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. -5. DO NOT mock any functions, you must test their functionality as is. -6. DO NOT assert the output value, run the code and verify it runs without any errors and assert only the output format or data structure. -7. DO NOT import the testing function as it will available in the testing environment. -8. Print the output of the function that is being tested. -""" - - -FIX_BUG = """ -**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages. 
- -**Instructions**: -Please re-complete the code to fix the error message. Here is the previous version: -```python -{code} -``` - -When we run this test code: -```python -{tests} -``` - -It raises this error: -```python -{result} -``` - -This is previous feedback provided on the code: -{feedback} - -Please fix the bug by follow the error information and return a JSON object with the following format: -{{ - "reflections": str # any thoughts you have about the bug and how you fixed it - "code": str # the fixed code if any, else an empty string - "test": str # the fixed test code if any, else an empty string -}} -""" - - -REFLECT = """ -**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad. - -**Context**: -{context} - -**Plan**: -{plan} - -**Code**: -{code} - -**Instructions**: -1. **Understand the User Request**: Read the user request and understand what the user is asking for. -2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request. -3. **Review the Code**: Check the code to see if it solves the user request. -4. DO NOT add any reflections for test cases, these are taken care of. - -Respond in JSON format with the following structure: -{{ - "feedback": str # the feedback you would give to the coder and tester - "success": bool # whether the code and tests meet the user request -}} -""" diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index a0035b29..7904cea0 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -6,14 +6,13 @@ from langsmith.wrappers import wrap_openai from openai import AzureOpenAI, OpenAI -from vision_agent.tools import ( - CHOOSE_PARAMS, +from vision_agent.tools.easytool_tools import ( CLIP, - SYSTEM_PROMPT, GroundingDINO, GroundingSAM, ZeroShotCounting, ) +from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT class LLM(ABC): @@ -141,7 +140,7 @@ def generate_zero_shot_counter(self, question: str) -> Callable: return lambda x: ZeroShotCounting()(**{"image": x}) def generate_image_qa_tool(self, question: str) -> Callable: - from vision_agent.tools import ImageQuestionAnswering + from vision_agent.tools.easytool_tools import ImageQuestionAnswering return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x}) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index cc8861bd..a8fa8312 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -9,7 +9,7 @@ import requests from openai import AzureOpenAI, OpenAI -from vision_agent.tools import CHOOSE_PARAMS, SYSTEM_PROMPT +from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT _LOGGER = logging.getLogger(__name__) @@ -198,7 +198,7 @@ def generate( return cast(str, response.choices[0].message.content) def generate_classifier(self, question: str) -> Callable: - from vision_agent.tools import CLIP + from vision_agent.tools.easytool_tools import CLIP api_doc = CLIP.description + "\n" + str(CLIP.usage) prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) @@ -223,7 +223,7 @@ def generate_classifier(self, question: str) -> Callable: return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x}) def generate_detector(self, question: str) -> Callable: - from vision_agent.tools import GroundingDINO + from 
vision_agent.tools.easytool_tools import GroundingDINO api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) @@ -248,7 +248,7 @@ def generate_detector(self, question: str) -> Callable: return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x}) def generate_segmentor(self, question: str) -> Callable: - from vision_agent.tools import GroundingSAM + from vision_agent.tools.easytool_tools import GroundingSAM api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) @@ -273,12 +273,12 @@ def generate_segmentor(self, question: str) -> Callable: return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x}) def generate_zero_shot_counter(self, question: str) -> Callable: - from vision_agent.tools import ZeroShotCounting + from vision_agent.tools.easytool_tools import ZeroShotCounting return lambda x: ZeroShotCounting()(**{"image": x}) def generate_image_qa_tool(self, question: str) -> Callable: - from vision_agent.tools import ImageQuestionAnswering + from vision_agent.tools.easytool_tools import ImageQuestionAnswering return lambda x: ImageQuestionAnswering()(**{"prompt": question, "image": x}) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 08a96d81..a2ab06b3 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,25 +1,24 @@ from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT -from .tools import ( # Counter, - CLIP, - OCR, +from .tools import ( + TOOL_DESCRIPTIONS, + TOOL_DOCSTRING, TOOLS, - BboxIoU, - BboxStats, - BoxDistance, - Crop, - DINOv, - ExtractFrames, - GroundingDINO, - GroundingSAM, - ImageCaption, - ImageQuestionAnswering, - MaskDistance, - ObjectDistance, - SegArea, - SegIoU, - Tool, - VisualPromptCounting, - VisualQuestionAnswering, - ZeroShotCounting, - register_tool, + TOOLS_DF, + UTILITIES_DOCSTRING, + clip, + closest_box_distance, + closest_mask_distance, + extract_frames, + grounding_dino, + grounding_sam, + image_caption, + image_question_answering, + load_image, + ocr, + overlay_bounding_boxes, + overlay_segmentation_masks, + save_image, + save_json, + visual_prompt_counting, + zero_shot_counting, ) diff --git a/vision_agent/tools/easytool_tools.py b/vision_agent/tools/easytool_tools.py new file mode 100644 index 00000000..fdbc1fe2 --- /dev/null +++ b/vision_agent/tools/easytool_tools.py @@ -0,0 +1,1242 @@ +import io +import logging +import tempfile +from abc import ABC +from pathlib import Path +from typing import Any, Dict, List, Tuple, Type, Union, cast + +import numpy as np +import requests +from PIL import Image +from PIL.Image import Image as ImageType +from scipy.spatial import distance # type: ignore + +from vision_agent.lmm import OpenAILMM +from vision_agent.tools.tool_utils import _send_inference_request +from vision_agent.utils import extract_frames_from_video +from vision_agent.utils.image_utils import ( + b64_to_pil, + convert_to_b64, + denormalize_bbox, + get_image_size, + normalize_bbox, + rle_decode, +) + +_LOGGER = logging.getLogger(__name__) + + +class Tool(ABC): + name: str + description: str + usage: Dict + + def __call__(self, *args: Any, **kwargs: Any) -> Any: + raise NotImplementedError + + +class NoOp(Tool): + name = "noop_" + description = "'noop_' is a no-op tool that does nothing if you do not want answer the question directly and not use a tool." 
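+    # Takes no parameters and simply returns None, giving the agent an explicit
+    # "do nothing" option when no tool should be invoked.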
+ usage = { + "required_parameters": [], + "examples": [ + { + "scenario": "If you do not want to use a tool.", + "parameters": {}, + } + ], + } + + def __call__(self) -> None: + return None + + +class CLIP(Tool): + r"""CLIP is a tool that can classify or tag any image given a set of input classes + or tags. + + Example + ------- + >>> import vision_agent as va + >>> clip = va.tools.CLIP() + >>> clip("red line, yellow dot", "ct_scan1.jpg")) + [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}] + """ + + name = "clip_" + description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores." + usage = { + "required_parameters": [ + {"name": "prompt", "type": "str"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you classify this image as a cat? Image name: cat.jpg", + "parameters": {"prompt": "cat", "image": "cat.jpg"}, + }, + { + "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg", + "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"}, + }, + { + "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg", + "parameters": { + "prompt": "red shirt, green shirt, other", + "image": "shirts.jpg", + }, + }, + ], + } + + # TODO: Add support for input multiple images, which aligns with the output type. + def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict: + """Invoke the CLIP model. + + Parameters: + prompt: a string includes a list of classes or tags to classify the image. + image: the input image to classify. + + Returns: + A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}] + """ + image_b64 = convert_to_b64(image) + data = { + "prompt": prompt, + "image": image_b64, + "tool": "closed_set_image_classification", + } + resp_data = _send_inference_request(data, "tools") + resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]] + return resp_data + + +class ImageCaption(Tool): + r"""ImageCaption is a tool that can caption an image based on its contents or tags. + + Example + ------- + >>> import vision_agent as va + >>> caption = va.tools.ImageCaption() + >>> caption("image1.jpg") + {'text': ['a box of orange and white socks']} + """ + + name = "image_caption_" + description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image." + usage = { + "required_parameters": [ + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you describe this image? Image name: cat.jpg", + "parameters": {"image": "cat.jpg"}, + }, + { + "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg", + "parameters": {"image": "cat_dog.jpg"}, + }, + ], + } + + # TODO: Add support for input multiple images, which aligns with the output type. + def __call__(self, image: Union[str, ImageType]) -> Dict: + """Invoke the Image captioning model. + + Parameters: + image: the input image to caption. + + Returns: + A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. 
[{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}] + """ + image_b64 = convert_to_b64(image) + data = { + "image": image_b64, + "tool": "image_captioning", + } + return _send_inference_request(data, "tools") + + +class GroundingDINO(Tool): + r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as + category names or referring expressions. + + Example + ------- + >>> import vision_agent as va + >>> t = va.tools.GroundingDINO() + >>> t("red line. yellow dot", "ct_scan1.jpg") + [{'labels': ['red line', 'yellow dot'], + 'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]], + 'scores': [0.98, 0.02]}] + """ + + name = "grounding_dino_" + description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores." + usage = { + "required_parameters": [ + {"name": "prompt", "type": "str"}, + {"name": "image", "type": "str"}, + ], + "optional_parameters": [ + {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5}, + {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99}, + ], + "examples": [ + { + "scenario": "Can you detect and count the giraffes and zebras in this image? Image name: animal.jpg", + "parameters": { + "prompt": "giraffe. zebra", + "image": "person.jpg", + }, + }, + { + "scenario": "Can you build me a car detector?", + "parameters": {"prompt": "car", "image": ""}, + }, + { + "scenario": "Can you detect the person on the left and right? Image name: person.jpg", + "parameters": { + "prompt": "left person. right person", + "image": "person.jpg", + }, + }, + { + "scenario": "Detect the red shirts and green shirt. Image name: shirts.jpg", + "parameters": { + "prompt": "red shirt. green shirt", + "image": "shirts.jpg", + "box_threshold": 0.20, + "iou_threshold": 0.20, + }, + }, + ], + } + + # TODO: Add support for input multiple images, which aligns with the output type. + def __call__( + self, + prompt: str, + image: Union[str, Path, ImageType], + box_threshold: float = 0.20, + iou_threshold: float = 0.20, + ) -> Dict: + """Invoke the Grounding DINO model. + + Parameters: + prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat" + image: the input image to run against. + box_threshold: the threshold to filter out the bounding boxes with low scores. + iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold. + + Returns: + A dictionary containing the labels, scores, and bboxes, which is the detection result for the input image. 
+ """ + image_size = get_image_size(image) + image_b64 = convert_to_b64(image) + request_data = { + "prompt": prompt, + "image": image_b64, + "tool": "visual_grounding", + "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, + } + data: Dict[str, Any] = _send_inference_request(request_data, "tools") + if "bboxes" in data: + data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]] + if "scores" in data: + data["scores"] = [round(score, 2) for score in data["scores"]] + if "labels" in data: + data["labels"] = list(data["labels"]) + data["image_size"] = image_size + return data + + +class GroundingSAM(Tool): + r"""Grounding SAM is a tool that can detect and segment arbitrary objects with + inputs such as category names or referring expressions. + + Example + ------- + >>> import vision_agent as va + >>> t = va.tools.GroundingSAM() + >>> t("red line, yellow dot", "ct_scan1.jpg"]) + [{'labels': ['yellow dot', 'red line'], + 'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]], + 'masks': [array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, + array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [1, 1, 1, ..., 1, 1, 1], + [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}] + """ + + name = "grounding_sam_" + description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores." + usage = { + "required_parameters": [ + {"name": "prompt", "type": "str"}, + {"name": "image", "type": "str"}, + ], + "optional_parameters": [ + {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5}, + {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99}, + ], + "examples": [ + { + "scenario": "Can you segment the apples and grapes in this image? Image name: fruits.jpg", + "parameters": { + "prompt": "apple. grape", + "image": "fruits.jpg", + }, + }, + { + "scenario": "Can you build me a car segmentor?", + "parameters": {"prompt": "car", "image": ""}, + }, + { + "scenario": "Can you segment the person on the left and right? Image name: person.jpg", + "parameters": { + "prompt": "left person. right person", + "image": "person.jpg", + }, + }, + { + "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg", + "parameters": { + "prompt": "red shirt, green shirt", + "image": "shirts.jpg", + "box_threshold": 0.20, + "iou_threshold": 0.20, + }, + }, + ], + } + + # TODO: Add support for input multiple images, which aligns with the output type. + def __call__( + self, + prompt: str, + image: Union[str, ImageType], + box_threshold: float = 0.2, + iou_threshold: float = 0.2, + ) -> Dict: + """Invoke the Grounding SAM model. + + Parameters: + prompt: a list of classes to segment. + image: the input image to segment. + box_threshold: the threshold to filter out the bounding boxes with low scores. + iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold. + + Returns: + A dictionary containing the labels, scores, bboxes and masks for the input image. 
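+            The masks are binary 2D numpy arrays where 1 indicates the object and 0 indicates the background.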
+ """ + image_size = get_image_size(image) + image_b64 = convert_to_b64(image) + request_data = { + "prompt": prompt, + "image": image_b64, + "tool": "visual_grounding_segment", + "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, + } + data: Dict[str, Any] = _send_inference_request(request_data, "tools") + if "bboxes" in data: + data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]] + if "masks" in data: + data["masks"] = [ + rle_decode(mask_rle=mask, shape=data["mask_shape"]) + for mask in data["masks"] + ] + data["image_size"] = image_size + data.pop("mask_shape", None) + return data + + +class DINOv(Tool): + r"""DINOv is a tool that can detect and segment similar objects with the given input masks. + + Example + ------- + >>> import vision_agent as va + >>> t = va.tools.DINOv() + >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg"]) + [{'scores': [0.512, 0.212], + 'masks': [array([[0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, + array([[0, 0, 0, ..., 0, 0, 0], + ..., + [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}] + """ + + name = "dinov_" + description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask." + usage = { + "required_parameters": [ + {"name": "prompt", "type": "List[Dict[str, str]]"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg", + "parameters": { + "prompt": [ + {"mask": "balloon_mask.jpg", "image": "balloon.jpg"}, + ], + "image": "input.jpg", + }, + }, + { + "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: mask.png Reference mask: background.png", + "parameters": { + "prompt": [ + {"mask": "mask.png", "image": "background.png"}, + ], + "image": "original.jpg", + }, + }, + ], + } + + def __call__( + self, prompt: List[Dict[str, str]], image: Union[str, ImageType] + ) -> Dict: + """Invoke the DINOv model. + + Parameters: + prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}. + image: the input image to segment. + + Returns: + A dictionary of the below keys: 'scores', 'masks' and 'mask_shape', which stores a list of detected segmentation masks and its scores. 
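+            Note that 'mask_shape' is removed from the returned dictionary and exposed as 'image_size', and a 'labels' key is added for the visual prompts.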
+ """ + image_b64 = convert_to_b64(image) + for p in prompt: + p["mask"] = convert_to_b64(p["mask"]) + p["image"] = convert_to_b64(p["image"]) + request_data = { + "prompt": prompt, + "image": image_b64, + } + data: Dict[str, Any] = _send_inference_request(request_data, "dinov") + if "bboxes" in data: + data["bboxes"] = [ + normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"] + ] + if "masks" in data: + data["masks"] = [ + rle_decode(mask_rle=mask, shape=data["mask_shape"]) + for mask in data["masks"] + ] + data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))] + mask_shape = data.pop("mask_shape", None) + data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None + return data + + +class AgentDINOv(DINOv): + def __call__( + self, + prompt: List[Dict[str, str]], + image: Union[str, ImageType], + ) -> Dict: + rets = super().__call__(prompt, image) + mask_files = [] + for mask in rets["masks"]: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + file_name = Path(tmp.name).with_suffix(".mask.png") + Image.fromarray(mask * 255).save(file_name) + mask_files.append(str(file_name)) + rets["masks"] = mask_files + return rets + + +class AgentGroundingSAM(GroundingSAM): + r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files + returns the file name. This makes it easier for agents to use. + """ + + def __call__( + self, + prompt: str, + image: Union[str, ImageType], + box_threshold: float = 0.2, + iou_threshold: float = 0.75, + ) -> Dict: + rets = super().__call__(prompt, image, box_threshold, iou_threshold) + mask_files = [] + for mask in rets["masks"]: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + file_name = Path(tmp.name).with_suffix(".mask.png") + Image.fromarray(mask * 255).save(file_name) + mask_files.append(str(file_name)) + rets["masks"] = mask_files + return rets + + +class ZeroShotCounting(Tool): + r"""ZeroShotCounting is a tool that can count total number of instances of an object + present in an image belonging to same class without a text or visual prompt. + + Example + ------- + >>> import vision_agent as va + >>> zshot_count = va.tools.ZeroShotCounting() + >>> zshot_count("image1.jpg") + {'count': 45} + """ + + name = "zero_shot_counting_" + description = "'zero_shot_counting_' is a tool that counts foreground items given only an image and no other information. It returns only the count of the objects in the image" + + usage = { + "required_parameters": [ + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you count the items in the image? Image name: lids.jpg", + "parameters": {"image": "lids.jpg"}, + }, + { + "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg", + "parameters": {"image": "tray.jpg"}, + }, + { + "scenario": "Can you build me an object counting tool? Image name: shirts.jpg", + "parameters": { + "image": "shirts.jpg", + }, + }, + ], + } + + # TODO: Add support for input multiple images, which aligns with the output type. + def __call__(self, image: Union[str, ImageType]) -> Dict: + """Invoke the Zero shot counting model. + + Parameters: + image: the input image. + + Returns: + A dictionary containing the key 'count' and the count as value. E.g. 
{count: 12} + """ + image_b64 = convert_to_b64(image) + data = { + "image": image_b64, + "tool": "zero_shot_counting", + } + resp_data = _send_inference_request(data, "tools") + resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) + return resp_data + + +class VisualPromptCounting(Tool): + r"""VisualPromptCounting is a tool that can count total number of instances of an object + present in an image belonging to same class with help of an visual prompt which is a bounding box. + + Example + ------- + >>> import vision_agent as va + >>> prompt_count = va.tools.VisualPromptCounting() + >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]}) + {'count': 23} + """ + + name = "visual_prompt_counting_" + description = "'visual_prompt_counting_' is a tool that counts foreground items in an image given a visual prompt which is a bounding box describing the object. It returns only the count of the objects in the image." + + usage = { + "required_parameters": [ + {"name": "image", "type": "str"}, + {"name": "prompt", "type": "Dict[str, List[float]"}, + ], + "examples": [ + { + "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg", + "parameters": { + "image": "lids.jpg", + "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]}, + }, + }, + { + "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}", + "parameters": { + "image": "tray.jpg", + "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]}, + }, + }, + { + "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg", + "parameters": { + "image": "shirts.jpg", + "prompt": {"bbox": [100, 115, 200, 200]}, + }, + }, + { + "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}", + "parameters": { + "image": "shoes.jpg", + "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]}, + }, + }, + ], + } + + def __call__( + self, image: Union[str, ImageType], prompt: Dict[str, List[float]] + ) -> Dict: + """Invoke the few shot counting model. + + Parameters: + image: the input image. + prompt: the visual prompt which is a bounding box describing the object. + + Returns: + A dictionary containing the key 'count' and the count as value. E.g. {count: 12} + """ + image_size = get_image_size(image) + bbox = prompt["bbox"] + bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size))) + image_b64 = convert_to_b64(image) + + data = { + "image": image_b64, + "prompt": bbox_str, + "tool": "few_shot_counting", + } + resp_data = _send_inference_request(data, "tools") + resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) + return resp_data + + +class VisualQuestionAnswering(Tool): + r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same + + Example + ------- + >>> import vision_agent as va + >>> vqa_tool = va.tools.VisualQuestionAnswering() + >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail") + {'text': "The image contains a cat sitting on a table with a bowl of milk."} + """ + + name = "visual_question_answering_" + description = "'visual_question_answering_' is a tool that can answer basic questions about the image given a question and an image. 
It returns a text describing the image and the answer to the question" + + usage = { + "required_parameters": [ + {"name": "image", "type": "str"}, + {"name": "prompt", "type": "str"}, + ], + "examples": [ + { + "scenario": "Describe this image in detail. Image name: cat.jpg", + "parameters": { + "image": "cats.jpg", + "prompt": "Describe this image in detail", + }, + }, + { + "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg", + "parameters": { + "image": "sign.jpg", + "prompt": "Can you help me with this street sign ? What does it say ?", + }, + }, + { + "scenario": "Describe the weather in the image for me ? Image name: weather.jpg", + "parameters": { + "image": "weather.jpg", + "prompt": "Describe the weather in the image for me ", + }, + }, + { + "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg", + "parameters": { + "image": "chart.jpg", + "prompt": "Which 2 are the least frequent bins in this histogram", + }, + }, + ], + } + + def __call__(self, image: str, prompt: str) -> Dict: + """Invoke the visual question answering model. + + Parameters: + image: the input image. + + Returns: + A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'} + """ + + gpt = OpenAILMM() + return {"text": gpt(input=prompt, images=[image])} + + +class ImageQuestionAnswering(Tool): + r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same + It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function. + It is also useful if the user wants the data to be not exposed to OpenAI endpoints + + Example + ------- + >>> import vision_agent as va + >>> vqa_tool = va.tools.ImageQuestionAnswering() + >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail") + {'text': "The image contains a cat sitting on a table with a bowl of milk."} + """ + + name = "image_question_answering_" + description = "'image_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question" + + usage = { + "required_parameters": [ + {"name": "image", "type": "str"}, + {"name": "prompt", "type": "str"}, + ], + "examples": [ + { + "scenario": "Describe this image in detail. Image name: cat.jpg", + "parameters": { + "image": "cats.jpg", + "prompt": "Describe this image in detail", + }, + }, + { + "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg", + "parameters": { + "image": "sign.jpg", + "prompt": "Can you help me with this street sign ? What does it say ?", + }, + }, + { + "scenario": "Describe the weather in the image for me ? Image name: weather.jpg", + "parameters": { + "image": "weather.jpg", + "prompt": "Describe the weather in the image for me ", + }, + }, + { + "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram", + "parameters": { + "image": "chart.jpg", + "prompt": "Which 2 are the least frequent bins in this histogram", + }, + }, + ], + } + + def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict: + """Invoke the visual question answering model. + + Parameters: + image: the input image. 
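+            prompt: the question to ask about the image.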
+ + Returns: + A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'} + """ + + image_b64 = convert_to_b64(image) + data = { + "image": image_b64, + "prompt": prompt, + "tool": "image_question_answering", + } + + return _send_inference_request(data, "tools") + + +class Crop(Tool): + r"""Crop crops an image given a bounding box and returns a file name of the cropped image.""" + + name = "crop_" + description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image. It returns a file with the cropped image." + usage = { + "required_parameters": [ + {"name": "bbox", "type": "List[float]"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you crop the image to the bounding box [0.1, 0.1, 0.9, 0.9]? Image name: image.jpg", + "parameters": {"bbox": [0.1, 0.1, 0.9, 0.9], "image": "image.jpg"}, + }, + { + "scenario": "Cut out the image to the bounding box [0.2, 0.2, 0.8, 0.8]. Image name: car.jpg", + "parameters": {"bbox": [0.2, 0.2, 0.8, 0.8], "image": "car.jpg"}, + }, + ], + } + + def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict: + pil_image = Image.open(image) + width, height = pil_image.size + bbox = [ + int(bbox[0] * width), + int(bbox[1] * height), + int(bbox[2] * width), + int(bbox[3] * height), + ] + cropped_image = pil_image.crop(bbox) # type: ignore + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + cropped_image.save(tmp.name) + + return {"image": tmp.name} + + +class BboxStats(Tool): + r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places.""" + + name = "bbox_stats_" + description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places." + usage = { + "required_parameters": [ + {"name": "bboxes", "type": "List[int]"}, + {"name": "image_size", "type": "Tuple[int]"}, + ], + "examples": [ + { + "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]", + "parameters": { + "bboxes": [[0.2, 0.21, 0.34, 0.42]], + "image_size": (500, 1200), + }, + }, + { + "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]", + "parameters": { + "bboxes": [[0.2, 0.21, 0.34, 0.42]], + "image_size": (640, 480), + }, + }, + ], + } + + def __call__( + self, bboxes: List[List[int]], image_size: Tuple[int, int] + ) -> List[Dict]: + areas = [] + height, width = image_size + for bbox in bboxes: + x1, y1, x2, y2 = bbox + areas.append( + { + "width": round((x2 - x1) * width, 2), + "height": round((y2 - y1) * height, 2), + "area": round((x2 - x1) * (y2 - y1) * width * height, 2), + } + ) + + return areas + + +class SegArea(Tool): + r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places.""" + + name = "seg_area_" + description = "'seg_area_' returns the area of the given segmentation mask in pixels normalized to 2 decimal places." 
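+    # The area is the number of nonzero mask pixels (mask values are clipped to 0/1
+    # before summing), rounded to 2 decimal places.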
+ usage = { + "required_parameters": [{"name": "masks", "type": "str"}], + "examples": [ + { + "scenario": "If you want to calculate the area of the segmentation mask, pass the masks file name.", + "parameters": {"masks": "mask_file.jpg"}, + }, + ], + } + + def __call__(self, masks: Union[str, Path]) -> float: + pil_mask = Image.open(str(masks)) + np_mask = np.array(pil_mask) + np_mask = np.clip(np_mask, 0, 1) + return cast(float, round(np.sum(np_mask), 2)) + + +class BboxIoU(Tool): + name = "bbox_iou_" + description = "'bbox_iou_' returns the intersection over union of two bounding boxes. This is a good tool for determining if two objects are overlapping." + usage = { + "required_parameters": [ + {"name": "bbox1", "type": "List[int]"}, + {"name": "bbox2", "type": "List[int]"}, + ], + "examples": [ + { + "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]", + "parameters": { + "bbox1": [0.2, 0.21, 0.34, 0.42], + "bbox2": [0.3, 0.31, 0.44, 0.52], + }, + } + ], + } + + def __call__(self, bbox1: List[int], bbox2: List[int]) -> float: + x1, y1, x2, y2 = bbox1 + x3, y3, x4, y4 = bbox2 + xA = max(x1, x3) + yA = max(y1, y3) + xB = min(x2, x4) + yB = min(y2, y4) + inter_area = max(0, xB - xA) * max(0, yB - yA) + boxa_area = (x2 - x1) * (y2 - y1) + boxb_area = (x4 - x3) * (y4 - y3) + iou = inter_area / float(boxa_area + boxb_area - inter_area) + return round(iou, 2) + + +class SegIoU(Tool): + name = "seg_iou_" + description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files." + usage = { + "required_parameters": [ + {"name": "mask1", "type": "str"}, + {"name": "mask2", "type": "str"}, + ], + "examples": [ + { + "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg", + "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"}, + } + ], + } + + def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float: + pil_mask1 = Image.open(str(mask1)) + pil_mask2 = Image.open(str(mask2)) + np_mask1 = np.clip(np.array(pil_mask1), 0, 1) + np_mask2 = np.clip(np.array(pil_mask2), 0, 1) + intersection = np.logical_and(np_mask1, np_mask2) + union = np.logical_or(np_mask1, np_mask2) + iou = np.sum(intersection) / np.sum(union) + return cast(float, round(iou, 2)) + + +class BboxContains(Tool): + name = "bbox_contains_" + description = "Given two bounding boxes, a target bounding box and a region bounding box, 'bbox_contains_' returns the intersection of the two bounding boxes which is the percentage area of the target bounding box overlaps with the region bounding box. This is a good tool for determining if the region object contains the target object." + usage = { + "required_parameters": [ + {"name": "target", "type": "List[int]"}, + {"name": "target_class", "type": "str"}, + {"name": "region", "type": "List[int]"}, + {"name": "region_class", "type": "str"}, + ], + "examples": [ + { + "scenario": "Determine if the dog on the couch, bounding box of the dog: [0.2, 0.21, 0.34, 0.42], bounding box of the couch: [0.3, 0.31, 0.44, 0.52]", + "parameters": { + "target": [0.2, 0.21, 0.34, 0.42], + "target_class": "dog", + "region": [0.3, 0.31, 0.44, 0.52], + "region_class": "couch", + }, + }, + { + "scenario": "Check if the kid is in the pool? 
bounding box of the kid: [0.2, 0.21, 0.34, 0.42], bounding box of the pool: [0.3, 0.31, 0.44, 0.52]", + "parameters": { + "target": [0.2, 0.21, 0.34, 0.42], + "target_class": "kid", + "region": [0.3, 0.31, 0.44, 0.52], + "region_class": "pool", + }, + }, + ], + } + + def __call__( + self, target: List[int], target_class: str, region: List[int], region_class: str + ) -> Dict[str, Union[str, float]]: + x1, y1, x2, y2 = target + x3, y3, x4, y4 = region + xA = max(x1, x3) + yA = max(y1, y3) + xB = min(x2, x4) + yB = min(y2, y4) + inter_area = max(0, xB - xA) * max(0, yB - yA) + boxa_area = (x2 - x1) * (y2 - y1) + iou = inter_area / float(boxa_area) + area = round(iou, 2) + return { + "target_class": target_class, + "region_class": region_class, + "intersection": area, + } + + +class ObjectDistance(Tool): + name = "object_distance_" + description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects." + usage = { + "required_parameters": [ + {"name": "object1", "type": "Dict[str, Any]"}, + {"name": "object2", "type": "Dict[str, Any]"}, + ], + "examples": [ + { + "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}", + "parameters": { + "object1": { + "bboxes": [0.2, 0.21, 0.34, 0.42], + "scores": 0.54, + "masks": "mask_file1.png", + }, + "object2": { + "bboxes": [0.3, 0.31, 0.44, 0.52], + "scores": 0.66, + "masks": "mask_file2.png", + }, + }, + } + ], + } + + def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float: + if "masks" in object1 and "masks" in object2: + mask1 = object1["masks"] + mask2 = object2["masks"] + return MaskDistance()(mask1, mask2) + elif "bboxes" in object1 and "bboxes" in object2: + bbox1 = object1["bboxes"] + bbox2 = object2["bboxes"] + return BoxDistance()(bbox1, bbox2) + else: + raise ValueError("Either of the objects should have masks or bboxes") + + +class BoxDistance(Tool): + name = "box_distance_" + description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes" + usage = { + "required_parameters": [ + {"name": "bbox1", "type": "List[int]"}, + {"name": "bbox2", "type": "List[int]"}, + ], + "examples": [ + { + "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]", + "parameters": { + "bbox1": [0.2, 0.21, 0.34, 0.42], + "bbox2": [0.3, 0.31, 0.44, 0.52], + }, + } + ], + } + + def __call__(self, bbox1: List[int], bbox2: List[int]) -> float: + x11, y11, x12, y12 = bbox1 + x21, y21, x22, y22 = bbox2 + + horizontal_dist = np.max([0, x21 - x12, x11 - x22]) + vertical_dist = np.max([0, y21 - y12, y11 - y22]) + + return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2)) + + +class MaskDistance(Tool): + name = "mask_distance_" + description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. 
It returns the minumum distance between the given masks" + usage = { + "required_parameters": [ + {"name": "mask1", "type": "str"}, + {"name": "mask2", "type": "str"}, + ], + "examples": [ + { + "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg", + "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"}, + } + ], + } + + def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float: + pil_mask1 = Image.open(str(mask1)) + pil_mask2 = Image.open(str(mask2)) + np_mask1 = np.clip(np.array(pil_mask1), 0, 1) + np_mask2 = np.clip(np.array(pil_mask2), 0, 1) + + mask1_points = np.transpose(np.nonzero(np_mask1)) + mask2_points = np.transpose(np.nonzero(np_mask2)) + dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean") + return cast(float, np.round(np.min(dist_matrix), 2)) + + +class ExtractFrames(Tool): + r"""Extract frames from a video.""" + + name = "extract_frames_" + description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path." + usage = { + "required_parameters": [{"name": "video_uri", "type": "str"}], + "optional_parameters": [{"name": "frames_every", "type": "float"}], + "examples": [ + { + "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4", + "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"}, + }, + { + "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4", + "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2}, + }, + ], + } + + def __call__( + self, video_uri: str, frames_every: float = 2 + ) -> List[Tuple[str, float]]: + """Extract frames from a video. + + + Parameters: + video_uri: the path to the video file or a url points to the video data + + Returns: + a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. + """ + frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2)) + result = [] + _LOGGER.info( + f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks." + ) + for frame, ts in frames: + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + file_name = Path(tmp.name).with_suffix(".frame.png") + Image.fromarray(frame).save(file_name) + result.append((str(file_name), ts)) + return result + + +class OCR(Tool): + name = "ocr_" + description = "'ocr_' extracts text from an image. It returns a list of detected text, bounding boxes, and confidence scores." + usage = { + "required_parameters": [ + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you extract the text from this image? 
Image name: image.png", + "parameters": {"image": "image.png"}, + }, + ], + } + _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV" + _URL = "https://app.landing.ai/ocr/v1/detect-text" + + def __call__(self, image: str) -> dict: + pil_image = Image.open(image).convert("RGB") + image_size = pil_image.size[::-1] + image_buffer = io.BytesIO() + pil_image.save(image_buffer, format="PNG") + buffer_bytes = image_buffer.getvalue() + image_buffer.close() + + res = requests.post( + self._URL, + files={"images": buffer_bytes}, + data={"language": "en"}, + headers={"contentType": "multipart/form-data", "apikey": self._API_KEY}, + ) + if res.status_code != 200: + _LOGGER.error(f"Request failed: {res.text}") + raise ValueError(f"Request failed: {res.text}") + + data = res.json() + output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []} + for det in data[0]: + output["labels"].append(det["text"]) + box = [ + det["location"][0]["x"], + det["location"][0]["y"], + det["location"][2]["x"], + det["location"][2]["y"], + ] + box = normalize_bbox(box, image_size) + output["bboxes"].append(box) + output["scores"].append(round(det["score"], 2)) + return output + + +class Calculator(Tool): + r"""Calculator is a tool that can perform basic arithmetic operations.""" + + name = "calculator_" + description = ( + "'calculator_' is a tool that can perform basic arithmetic operations." + ) + usage = { + "required_parameters": [{"name": "equation", "type": "str"}], + "examples": [ + { + "scenario": "If you want to calculate (2 * 3) + 4", + "parameters": {"equation": "2 + 4"}, + }, + { + "scenario": "If you want to calculate (4 + 2.5) / 2.1", + "parameters": {"equation": "(4 + 2.5) / 2.1"}, + }, + ], + } + + def __call__(self, equation: str) -> float: + return cast(float, round(eval(equation), 2)) + + +TOOLS = { + i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c} + for i, c in enumerate( + [ + NoOp, + CLIP, + GroundingDINO, + AgentGroundingSAM, + ZeroShotCounting, + VisualPromptCounting, + VisualQuestionAnswering, + AgentDINOv, + ExtractFrames, + Crop, + BboxStats, + SegArea, + ObjectDistance, + BboxContains, + SegIoU, + OCR, + Calculator, + ] + ) + if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage")) +} + + +def register_tool(tool: Type[Tool]) -> Type[Tool]: + r"""Add a tool to the list of available tools. + + Parameters: + tool: The tool to add. + """ + + if ( + not hasattr(tool, "name") + or not hasattr(tool, "description") + or not hasattr(tool, "usage") + ): + raise ValueError( + "The tool must have 'name', 'description' and 'usage' attributes." 
+ ) + + TOOLS[len(TOOLS)] = { + "name": tool.name, + "description": tool.description, + "usage": tool.usage, + "class": tool, + } + return tool diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index fdbc1fe2..8e202856 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,17 +1,18 @@ +import inspect import io +import json import logging import tempfile -from abc import ABC +from importlib import resources from pathlib import Path -from typing import Any, Dict, List, Tuple, Type, Union, cast +from typing import Any, Callable, Dict, List, Tuple, Union, cast import numpy as np +import pandas as pd import requests -from PIL import Image -from PIL.Image import Image as ImageType +from PIL import Image, ImageDraw, ImageFont from scipy.spatial import distance # type: ignore -from vision_agent.lmm import OpenAILMM from vision_agent.tools.tool_utils import _send_inference_request from vision_agent.utils import extract_frames_from_video from vision_agent.utils.image_utils import ( @@ -23,1220 +24,662 @@ rle_decode, ) +COLORS = [ + (158, 218, 229), + (219, 219, 141), + (23, 190, 207), + (188, 189, 34), + (199, 199, 199), + (247, 182, 210), + (127, 127, 127), + (227, 119, 194), + (196, 156, 148), + (197, 176, 213), + (140, 86, 75), + (148, 103, 189), + (255, 152, 150), + (152, 223, 138), + (214, 39, 40), + (44, 160, 44), + (255, 187, 120), + (174, 199, 232), + (255, 127, 14), + (31, 119, 180), +] +_API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV" +_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text" +logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -class Tool(ABC): - name: str - description: str - usage: Dict - - def __call__(self, *args: Any, **kwargs: Any) -> Any: - raise NotImplementedError +def grounding_dino( + prompt: str, + image: np.ndarray, + box_threshold: float = 0.20, + iou_threshold: float = 0.20, +) -> List[Dict[str, Any]]: + """'grounding_dino' is a tool that can detect and count objects given a text prompt + such as category names or referring expressions. It returns a list and count of + bounding boxes, label names and associated probability scores. + Parameters: + prompt (str): The prompt to ground to the image. + image (np.ndarray): The image to ground the prompt to. + box_threshold (float, optional): The threshold for the box detection. Defaults + to 0.20. + iou_threshold (float, optional): The threshold for the Intersection over Union + (IoU). Defaults to 0.20. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the score, label, and + bounding box of the detected objects with normalized coordinates + (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and + xmax and ymax are the coordinates of the bottom-right of the bounding box. -class NoOp(Tool): - name = "noop_" - description = "'noop_' is a no-op tool that does nothing if you do not want answer the question directly and not use a tool." - usage = { - "required_parameters": [], - "examples": [ - { - "scenario": "If you do not want to use a tool.", - "parameters": {}, - } - ], + Example + ------- + >>> grounding_dino("car. 
dinosaur", image) + [ + {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}, + {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5}, + ] + """ + image_size = image.shape[:2] + image_b64 = convert_to_b64(image) + request_data = { + "prompt": prompt, + "image": image_b64, + "tool": "visual_grounding", + "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, } + data: Dict[str, Any] = _send_inference_request(request_data, "tools") + return_data = [] + for i in range(len(data["bboxes"])): + return_data.append( + { + "score": round(data["scores"][i], 2), + "label": data["labels"][i], + "bbox": normalize_bbox(data["bboxes"][i], image_size), + } + ) + return return_data - def __call__(self) -> None: - return None +def grounding_sam( + prompt: str, + image: np.ndarray, + box_threshold: float = 0.20, + iou_threshold: float = 0.20, +) -> List[Dict[str, Any]]: + """'grounding_sam' is a tool that can detect and segment objects given a text + prompt such as category names or referring expressions. It returns a list of + bounding boxes, label names and masks file names and associated probability scores. -class CLIP(Tool): - r"""CLIP is a tool that can classify or tag any image given a set of input classes - or tags. + Parameters: + prompt (str): The prompt to ground to the image. + image (np.ndarray): The image to ground the prompt to. + box_threshold (float, optional): The threshold for the box detection. Defaults + to 0.20. + iou_threshold (float, optional): The threshold for the Intersection over Union + (IoU). Defaults to 0.20. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the score, label, + bounding box, and mask of the detected objects with normalized coordinates + (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and + xmax and ymax are the coordinates of the bottom-right of the bounding box. + The mask is binary 2D numpy array where 1 indicates the object and 0 indicates + the background. Example ------- - >>> import vision_agent as va - >>> clip = va.tools.CLIP() - >>> clip("red line, yellow dot", "ct_scan1.jpg")) - [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}] + >>> grounding_sam("car. dinosaur", image) + [ + { + 'score': 0.99, + 'label': 'dinosaur', + 'bbox': [0.1, 0.11, 0.35, 0.4], + 'mask': array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), + }, + ] """ - - name = "clip_" - description = "'clip_' is a tool that can classify any image given a set of input names or tags. It returns a list of the input names along with their probability scores." - usage = { - "required_parameters": [ - {"name": "prompt", "type": "str"}, - {"name": "image", "type": "str"}, - ], - "examples": [ - { - "scenario": "Can you classify this image as a cat? Image name: cat.jpg", - "parameters": {"prompt": "cat", "image": "cat.jpg"}, - }, - { - "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg", - "parameters": {"prompt": "cat, dog", "image": "cat_dog.jpg"}, - }, - { - "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? 
Image name: shirts.jpg", - "parameters": { - "prompt": "red shirt, green shirt, other", - "image": "shirts.jpg", - }, - }, - ], + image_size = image.shape[:2] + image_b64 = convert_to_b64(image) + request_data = { + "prompt": prompt, + "image": image_b64, + "tool": "visual_grounding_segment", + "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, } + data: Dict[str, Any] = _send_inference_request(request_data, "tools") + return_data = [] + for i in range(len(data["bboxes"])): + return_data.append( + { + "score": round(data["scores"][i], 2), + "label": data["labels"][i], + "bbox": normalize_bbox(data["bboxes"][i], image_size), + "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]), + } + ) + return return_data - # TODO: Add support for input multiple images, which aligns with the output type. - def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict: - """Invoke the CLIP model. - - Parameters: - prompt: a string includes a list of classes or tags to classify the image. - image: the input image to classify. - Returns: - A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. [{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}] - """ - image_b64 = convert_to_b64(image) - data = { - "prompt": prompt, - "image": image_b64, - "tool": "closed_set_image_classification", - } - resp_data = _send_inference_request(data, "tools") - resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]] - return resp_data +def extract_frames( + video_uri: Union[str, Path], fps: float = 0.5 +) -> List[Tuple[np.ndarray, float]]: + """'extract_frames' extracts frames from a video, returns a list of tuples (frame, + timestamp), where timestamp is the relative time in seconds where the frame was + captured. The frame is a local image file path. + Parameters: + video_uri (Union[str, Path]): The path to the video file. + fps (float, optional): The frame rate per second to extract the frames. Defaults + to 0.5. -class ImageCaption(Tool): - r"""ImageCaption is a tool that can caption an image based on its contents or tags. + Returns: + List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame + and the timestamp in seconds. Example ------- - >>> import vision_agent as va - >>> caption = va.tools.ImageCaption() - >>> caption("image1.jpg") - {'text': ['a box of orange and white socks']} + >>> extract_frames("path/to/video.mp4") + [(frame1, 0.0), (frame2, 0.5), ...] """ - name = "image_caption_" - description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image." - usage = { - "required_parameters": [ - {"name": "image", "type": "str"}, - ], - "examples": [ - { - "scenario": "Can you describe this image? Image name: cat.jpg", - "parameters": {"image": "cat.jpg"}, - }, - { - "scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg", - "parameters": {"image": "cat_dog.jpg"}, - }, - ], - } - - # TODO: Add support for input multiple images, which aligns with the output type. - def __call__(self, image: Union[str, ImageType]) -> Dict: - """Invoke the Image captioning model. + return extract_frames_from_video(str(video_uri), fps) - Parameters: - image: the input image to caption. - Returns: - A list of dictionaries containing the labels and scores. Each dictionary contains the classification result for an image. E.g. 
[{"labels": ["red line", "yellow dot"], "scores": [0.98, 0.02]}] - """ - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "tool": "image_captioning", - } - return _send_inference_request(data, "tools") +def ocr(image: np.ndarray) -> List[Dict[str, Any]]: + """'ocr' extracts text from an image. It returns a list of detected text, bounding + boxes, and confidence scores. + Parameters: + image (np.ndarray): The image to extract text from. -class GroundingDINO(Tool): - r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as - category names or referring expressions. + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox, + and confidence score. Example ------- - >>> import vision_agent as va - >>> t = va.tools.GroundingDINO() - >>> t("red line. yellow dot", "ct_scan1.jpg") - [{'labels': ['red line', 'yellow dot'], - 'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]], - 'scores': [0.98, 0.02]}] + >>> ocr(image) + [ + {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99}, + ] """ - name = "grounding_dino_" - description = "'grounding_dino_' is a tool that can detect and count multiple objects given a text prompt such as category names or referring expressions. It returns a list and count of bounding boxes, label names and associated probability scores." - usage = { - "required_parameters": [ - {"name": "prompt", "type": "str"}, - {"name": "image", "type": "str"}, - ], - "optional_parameters": [ - {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5}, - {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99}, - ], - "examples": [ - { - "scenario": "Can you detect and count the giraffes and zebras in this image? Image name: animal.jpg", - "parameters": { - "prompt": "giraffe. zebra", - "image": "person.jpg", - }, - }, - { - "scenario": "Can you build me a car detector?", - "parameters": {"prompt": "car", "image": ""}, - }, - { - "scenario": "Can you detect the person on the left and right? Image name: person.jpg", - "parameters": { - "prompt": "left person. right person", - "image": "person.jpg", - }, - }, - { - "scenario": "Detect the red shirts and green shirt. Image name: shirts.jpg", - "parameters": { - "prompt": "red shirt. green shirt", - "image": "shirts.jpg", - "box_threshold": 0.20, - "iou_threshold": 0.20, - }, - }, - ], - } + pil_image = Image.fromarray(image).convert("RGB") + image_size = pil_image.size[::-1] + image_buffer = io.BytesIO() + pil_image.save(image_buffer, format="PNG") + buffer_bytes = image_buffer.getvalue() + image_buffer.close() + + res = requests.post( + _OCR_URL, + files={"images": buffer_bytes}, + data={"language": "en"}, + headers={"contentType": "multipart/form-data", "apikey": _API_KEY}, + ) - # TODO: Add support for input multiple images, which aligns with the output type. - def __call__( - self, - prompt: str, - image: Union[str, Path, ImageType], - box_threshold: float = 0.20, - iou_threshold: float = 0.20, - ) -> Dict: - """Invoke the Grounding DINO model. - - Parameters: - prompt: one or multiple class names to detect. The classes should be separated by a period if there are multiple classes. E.g. "big dog . small cat" - image: the input image to run against. - box_threshold: the threshold to filter out the bounding boxes with low scores. - iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold. 
- - Returns: - A dictionary containing the labels, scores, and bboxes, which is the detection result for the input image. - """ - image_size = get_image_size(image) - image_b64 = convert_to_b64(image) - request_data = { - "prompt": prompt, - "image": image_b64, - "tool": "visual_grounding", - "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, - } - data: Dict[str, Any] = _send_inference_request(request_data, "tools") - if "bboxes" in data: - data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]] - if "scores" in data: - data["scores"] = [round(score, 2) for score in data["scores"]] - if "labels" in data: - data["labels"] = list(data["labels"]) - data["image_size"] = image_size - return data - - -class GroundingSAM(Tool): - r"""Grounding SAM is a tool that can detect and segment arbitrary objects with - inputs such as category names or referring expressions. + if res.status_code != 200: + raise ValueError(f"OCR request failed with status code {res.status_code}") + + data = res.json() + output = [] + for det in data[0]: + label = det["text"] + box = [ + det["location"][0]["x"], + det["location"][0]["y"], + det["location"][2]["x"], + det["location"][2]["y"], + ] + box = normalize_bbox(box, image_size) + output.append({"label": label, "bbox": box, "score": round(det["score"], 2)}) + + return output + + +def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]: + """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content. + It returns only the count of the objects in the image. + + Parameters: + image (np.ndarray): The image that contains lot of instances of a single object + + Returns: + Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}. Example ------- - >>> import vision_agent as va - >>> t = va.tools.GroundingSAM() - >>> t("red line, yellow dot", "ct_scan1.jpg"]) - [{'labels': ['yellow dot', 'red line'], - 'bboxes': [[0.38, 0.15, 0.59, 0.7], [0.48, 0.25, 0.69, 0.71]], - 'masks': [array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, - array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}] + >>> zero_shot_counting(image) + {'count': 45}, + """ - name = "grounding_sam_" - description = "'grounding_sam_' is a tool that can detect and segment multiple objects given a text prompt such as category names or referring expressions. It returns a list of bounding boxes, label names and masks file names and associated probability scores." - usage = { - "required_parameters": [ - {"name": "prompt", "type": "str"}, - {"name": "image", "type": "str"}, - ], - "optional_parameters": [ - {"name": "box_threshold", "type": "float", "min": 0.1, "max": 0.5}, - {"name": "iou_threshold", "type": "float", "min": 0.01, "max": 0.99}, - ], - "examples": [ - { - "scenario": "Can you segment the apples and grapes in this image? Image name: fruits.jpg", - "parameters": { - "prompt": "apple. grape", - "image": "fruits.jpg", - }, - }, - { - "scenario": "Can you build me a car segmentor?", - "parameters": {"prompt": "car", "image": ""}, - }, - { - "scenario": "Can you segment the person on the left and right? Image name: person.jpg", - "parameters": { - "prompt": "left person. 
right person", - "image": "person.jpg", - }, - }, - { - "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg", - "parameters": { - "prompt": "red shirt, green shirt", - "image": "shirts.jpg", - "box_threshold": 0.20, - "iou_threshold": 0.20, - }, - }, - ], + image_b64 = convert_to_b64(image) + data = { + "image": image_b64, + "tool": "zero_shot_counting", } + resp_data = _send_inference_request(data, "tools") + resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) + return resp_data + + +def visual_prompt_counting( + image: np.ndarray, visual_prompt: Dict[str, List[float]] +) -> Dict[str, Any]: + """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object. + It returns only the count of the objects in the image. + + Parameters: + image (np.ndarray): The image that contains lot of instances of a single object - # TODO: Add support for input multiple images, which aligns with the output type. - def __call__( - self, - prompt: str, - image: Union[str, ImageType], - box_threshold: float = 0.2, - iou_threshold: float = 0.2, - ) -> Dict: - """Invoke the Grounding SAM model. - - Parameters: - prompt: a list of classes to segment. - image: the input image to segment. - box_threshold: the threshold to filter out the bounding boxes with low scores. - iou_threshold: the threshold for intersection over union used in nms algorithm. It will suppress the boxes which have iou greater than this threshold. - - Returns: - A dictionary containing the labels, scores, bboxes and masks for the input image. - """ - image_size = get_image_size(image) - image_b64 = convert_to_b64(image) - request_data = { - "prompt": prompt, - "image": image_b64, - "tool": "visual_grounding_segment", - "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, - } - data: Dict[str, Any] = _send_inference_request(request_data, "tools") - if "bboxes" in data: - data["bboxes"] = [normalize_bbox(box, image_size) for box in data["bboxes"]] - if "masks" in data: - data["masks"] = [ - rle_decode(mask_rle=mask, shape=data["mask_shape"]) - for mask in data["masks"] - ] - data["image_size"] = image_size - data.pop("mask_shape", None) - return data - - -class DINOv(Tool): - r"""DINOv is a tool that can detect and segment similar objects with the given input masks. + Returns: + Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}. Example ------- - >>> import vision_agent as va - >>> t = va.tools.DINOv() - >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg"]) - [{'scores': [0.512, 0.212], - 'masks': [array([[0, 0, 0, ..., 0, 0, 0], - ..., - [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, - array([[0, 0, 0, ..., 0, 0, 0], - ..., - [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}] + >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]}) + {'count': 45}, + """ - name = "dinov_" - description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask." - usage = { - "required_parameters": [ - {"name": "prompt", "type": "List[Dict[str, str]]"}, - {"name": "image", "type": "str"}, - ], - "examples": [ - { - "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? 
Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg", - "parameters": { - "prompt": [ - {"mask": "balloon_mask.jpg", "image": "balloon.jpg"}, - ], - "image": "input.jpg", - }, - }, - { - "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: mask.png Reference mask: background.png", - "parameters": { - "prompt": [ - {"mask": "mask.png", "image": "background.png"}, - ], - "image": "original.jpg", - }, - }, - ], + image_size = get_image_size(image) + bbox = visual_prompt["bbox"] + bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size))) + image_b64 = convert_to_b64(image) + + data = { + "image": image_b64, + "prompt": bbox_str, + "tool": "few_shot_counting", } + resp_data = _send_inference_request(data, "tools") + resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) + return resp_data - def __call__( - self, prompt: List[Dict[str, str]], image: Union[str, ImageType] - ) -> Dict: - """Invoke the DINOv model. - - Parameters: - prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}. - image: the input image to segment. - - Returns: - A dictionary of the below keys: 'scores', 'masks' and 'mask_shape', which stores a list of detected segmentation masks and its scores. - """ - image_b64 = convert_to_b64(image) - for p in prompt: - p["mask"] = convert_to_b64(p["mask"]) - p["image"] = convert_to_b64(p["image"]) - request_data = { - "prompt": prompt, - "image": image_b64, - } - data: Dict[str, Any] = _send_inference_request(request_data, "dinov") - if "bboxes" in data: - data["bboxes"] = [ - normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"] - ] - if "masks" in data: - data["masks"] = [ - rle_decode(mask_rle=mask, shape=data["mask_shape"]) - for mask in data["masks"] - ] - data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))] - mask_shape = data.pop("mask_shape", None) - data["image_size"] = (mask_shape[0], mask_shape[1]) if mask_shape else None - return data - - -class AgentDINOv(DINOv): - def __call__( - self, - prompt: List[Dict[str, str]], - image: Union[str, ImageType], - ) -> Dict: - rets = super().__call__(prompt, image) - mask_files = [] - for mask in rets["masks"]: - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - file_name = Path(tmp.name).with_suffix(".mask.png") - Image.fromarray(mask * 255).save(file_name) - mask_files.append(str(file_name)) - rets["masks"] = mask_files - return rets - - -class AgentGroundingSAM(GroundingSAM): - r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files - returns the file name. This makes it easier for agents to use. - """ - def __call__( - self, - prompt: str, - image: Union[str, ImageType], - box_threshold: float = 0.2, - iou_threshold: float = 0.75, - ) -> Dict: - rets = super().__call__(prompt, image, box_threshold, iou_threshold) - mask_files = [] - for mask in rets["masks"]: - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - file_name = Path(tmp.name).with_suffix(".mask.png") - Image.fromarray(mask * 255).save(file_name) - mask_files.append(str(file_name)) - rets["masks"] = mask_files - return rets - - -class ZeroShotCounting(Tool): - r"""ZeroShotCounting is a tool that can count total number of instances of an object - present in an image belonging to same class without a text or visual prompt. 
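A short sketch of how the two counting tools defined above could be called; the image path and the example bounding box are placeholders, and both calls assume the hosted inference endpoint is reachable.

```python
from vision_agent.tools import load_image, zero_shot_counting, visual_prompt_counting

image = load_image("lids.jpg")  # placeholder image containing many similar objects

# Count with no prompt at all.
result = zero_shot_counting(image)
print(result["count"])

# Count using one example box (normalized xmin, ymin, xmax, ymax) as a visual prompt.
result = visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.14, 0.2]})
print(result["count"])
```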
+def image_question_answering(image: np.ndarray, prompt: str) -> str: + """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image. + It returns an answer to the question + + Parameters: + image (np.ndarray): The reference image used for the question + prompt (str): The question about the image + + Returns: + str: A string which is the answer to the given prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}. Example ------- - >>> import vision_agent as va - >>> zshot_count = va.tools.ZeroShotCounting() - >>> zshot_count("image1.jpg") - {'count': 45} - """ + >>> image_question_answering(image, 'What is the cat doing ?') + 'drinking milk' - name = "zero_shot_counting_" - description = "'zero_shot_counting_' is a tool that counts foreground items given only an image and no other information. It returns only the count of the objects in the image" + """ - usage = { - "required_parameters": [ - {"name": "image", "type": "str"}, - ], - "examples": [ - { - "scenario": "Can you count the items in the image? Image name: lids.jpg", - "parameters": {"image": "lids.jpg"}, - }, - { - "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg", - "parameters": {"image": "tray.jpg"}, - }, - { - "scenario": "Can you build me an object counting tool? Image name: shirts.jpg", - "parameters": { - "image": "shirts.jpg", - }, - }, - ], + image_b64 = convert_to_b64(image) + data = { + "image": image_b64, + "prompt": prompt, + "tool": "image_question_answering", } - # TODO: Add support for input multiple images, which aligns with the output type. - def __call__(self, image: Union[str, ImageType]) -> Dict: - """Invoke the Zero shot counting model. + answer = _send_inference_request(data, "tools") + return answer["text"][0] # type: ignore - Parameters: - image: the input image. - Returns: - A dictionary containing the key 'count' and the count as value. E.g. {count: 12} - """ - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "tool": "zero_shot_counting", - } - resp_data = _send_inference_request(data, "tools") - resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) - return resp_data +def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]: + """'clip' is a tool that can classify an image given a list of input classes or tags. + It returns the same list of the input classes along with their probability scores based on image content. + Parameters: + image (np.ndarray): The image to classify or tag + classes (List[str]): The list of classes or tags that is associated with the image -class VisualPromptCounting(Tool): - r"""VisualPromptCounting is a tool that can count total number of instances of an object - present in an image belonging to same class with help of an visual prompt which is a bounding box. + Returns: + Dict[str, Any]: A dictionary containing the labels and scores. One dictionary contains a list of given labels and other a list of scores. 
Example ------- - >>> import vision_agent as va - >>> prompt_count = va.tools.VisualPromptCounting() - >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]}) - {'count': 23} - """ + >>> clip(image, ['dog', 'cat', 'bird']) + {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]}, - name = "visual_prompt_counting_" - description = "'visual_prompt_counting_' is a tool that counts foreground items in an image given a visual prompt which is a bounding box describing the object. It returns only the count of the objects in the image." + """ - usage = { - "required_parameters": [ - {"name": "image", "type": "str"}, - {"name": "prompt", "type": "Dict[str, List[float]"}, - ], - "examples": [ - { - "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg", - "parameters": { - "image": "lids.jpg", - "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]}, - }, - }, - { - "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}", - "parameters": { - "image": "tray.jpg", - "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]}, - }, - }, - { - "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg", - "parameters": { - "image": "shirts.jpg", - "prompt": {"bbox": [100, 115, 200, 200]}, - }, - }, - { - "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}", - "parameters": { - "image": "shoes.jpg", - "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]}, - }, - }, - ], + image_b64 = convert_to_b64(image) + data = { + "prompt": ",".join(classes), + "image": image_b64, + "tool": "closed_set_image_classification", } + resp_data = _send_inference_request(data, "tools") + resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]] + return resp_data - def __call__( - self, image: Union[str, ImageType], prompt: Dict[str, List[float]] - ) -> Dict: - """Invoke the few shot counting model. - - Parameters: - image: the input image. - prompt: the visual prompt which is a bounding box describing the object. - - Returns: - A dictionary containing the key 'count' and the count as value. E.g. {count: 12} - """ - image_size = get_image_size(image) - bbox = prompt["bbox"] - bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size))) - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "prompt": bbox_str, - "tool": "few_shot_counting", - } - resp_data = _send_inference_request(data, "tools") - resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) - return resp_data +def image_caption(image: np.ndarray) -> str: + """'image_caption' is a tool that can caption an image based on its contents. + It returns a text describing the image. + Parameters: + image (np.ndarray): The image to caption -class VisualQuestionAnswering(Tool): - r"""VisualQuestionAnswering is a tool that can explain contents of an image and answer questions about the same + Returns: + str: A string which is the caption for the given image. Example ------- - >>> import vision_agent as va - >>> vqa_tool = va.tools.VisualQuestionAnswering() - >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail") - {'text': "The image contains a cat sitting on a table with a bowl of milk."} - """ + >>> image_caption(image) + 'This image contains a cat sitting on a table with a bowl of milk.' 
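To illustrate the classification and question-answering wrappers above, a small sketch; the image path and the question are placeholders, and both functions route through the same hosted "tools" endpoint.

```python
from vision_agent.tools import load_image, clip, image_question_answering

image = load_image("cat_dog.jpg")  # placeholder image

# clip returns the input classes with probability scores:
# {"labels": ["cat", "dog", "bird"], "scores": [...]}
classification = clip(image, ["cat", "dog", "bird"])
best = classification["labels"][classification["scores"].index(max(classification["scores"]))]
print(best)

# Free-form visual question answering returns a plain string.
print(image_question_answering(image, "What animals are in this picture?"))
```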
- name = "visual_question_answering_" - description = "'visual_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question" + """ - usage = { - "required_parameters": [ - {"name": "image", "type": "str"}, - {"name": "prompt", "type": "str"}, - ], - "examples": [ - { - "scenario": "Describe this image in detail. Image name: cat.jpg", - "parameters": { - "image": "cats.jpg", - "prompt": "Describe this image in detail", - }, - }, - { - "scenario": "Can you help me with this street sign in this image ? What does it say ? Image name: sign.jpg", - "parameters": { - "image": "sign.jpg", - "prompt": "Can you help me with this street sign ? What does it say ?", - }, - }, - { - "scenario": "Describe the weather in the image for me ? Image name: weather.jpg", - "parameters": { - "image": "weather.jpg", - "prompt": "Describe the weather in the image for me ", - }, - }, - { - "scenario": "Which 2 are the least frequent bins in this histogram ? Image name: chart.jpg", - "parameters": { - "image": "chart.jpg", - "prompt": "Which 2 are the least frequent bins in this histogram", - }, - }, - ], + image_b64 = convert_to_b64(image) + data = { + "image": image_b64, + "tool": "image_captioning", } - def __call__(self, image: str, prompt: str) -> Dict: - """Invoke the visual question answering model. + answer = _send_inference_request(data, "tools") + return answer["text"][0] # type: ignore - Parameters: - image: the input image. - Returns: - A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'} - """ - - gpt = OpenAILMM() - return {"text": gpt(input=prompt, images=[image])} +def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float: + """'closest_mask_distance' calculates the closest distance between two masks. + Parameters: + mask1 (np.ndarray): The first mask. + mask2 (np.ndarray): The second mask. -class ImageQuestionAnswering(Tool): - r"""ImageQuestionAnswering is a tool that can explain contents of an image and answer questions about the same - It is same as VisualQuestionAnswering but this tool is not used by agents. It is used when user requests a tool for VQA using generate_image_qa_tool function. - It is also useful if the user wants the data to be not exposed to OpenAI endpoints + Returns: + float: The closest distance between the two masks. Example ------- - >>> import vision_agent as va - >>> vqa_tool = va.tools.ImageQuestionAnswering() - >>> vqa_tool(image="image1.jpg", prompt="describe this image in detail") - {'text': "The image contains a cat sitting on a table with a bowl of milk."} + >>> closest_mask_distance(mask1, mask2) + 0.5 """ - name = "image_question_answering_" - description = "'image_question_answering_' is a tool that can answer basic questions about the image given a question and an image. It returns a text describing the image and the answer to the question" - - usage = { - "required_parameters": [ - {"name": "image", "type": "str"}, - {"name": "prompt", "type": "str"}, - ], - "examples": [ - { - "scenario": "Describe this image in detail. Image name: cat.jpg", - "parameters": { - "image": "cats.jpg", - "prompt": "Describe this image in detail", - }, - }, - { - "scenario": "Can you help me with this street sign in this image ? What does it say ? 
Image name: sign.jpg", - "parameters": { - "image": "sign.jpg", - "prompt": "Can you help me with this street sign ? What does it say ?", - }, - }, - { - "scenario": "Describe the weather in the image for me ? Image name: weather.jpg", - "parameters": { - "image": "weather.jpg", - "prompt": "Describe the weather in the image for me ", - }, - }, - { - "scenario": "Can you generate an image question answering tool ? Image name: chart.jpg, prompt: Which 2 are the least frequent bins in this histogram", - "parameters": { - "image": "chart.jpg", - "prompt": "Which 2 are the least frequent bins in this histogram", - }, - }, - ], - } + mask1 = np.clip(mask1, 0, 1) + mask2 = np.clip(mask2, 0, 1) + mask1_points = np.transpose(np.nonzero(mask1)) + mask2_points = np.transpose(np.nonzero(mask2)) + dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean") + return cast(float, np.min(dist_matrix)) - def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict: - """Invoke the visual question answering model. - Parameters: - image: the input image. +def closest_box_distance( + box1: List[float], box2: List[float], image_size: Tuple[int, int] +) -> float: + """'closest_box_distance' calculates the closest distance between two bounding boxes. - Returns: - A dictionary containing the key 'text' and the answer to the prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'} - """ + Parameters: + box1 (List[float]): The first bounding box. + box2 (List[float]): The second bounding box. + image_size (Tuple[int, int]): The size of the image given as (height, width). - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "prompt": prompt, - "tool": "image_question_answering", - } + Returns: + float: The closest distance between the two bounding boxes. - return _send_inference_request(data, "tools") + Example + ------- + >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400]) + 141.42 + """ + x11, y11, x12, y12 = denormalize_bbox(box1, image_size) + x21, y21, x22, y22 = denormalize_bbox(box2, image_size) -class Crop(Tool): - r"""Crop crops an image given a bounding box and returns a file name of the cropped image.""" + horizontal_distance = np.max([0, x21 - x12, x11 - x22]) + vertical_distance = np.max([0, y21 - y12, y11 - y22]) + return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2)) - name = "crop_" - description = "'crop_' crops an image given a bounding box and returns a file name of the cropped image. It returns a file with the cropped image." - usage = { - "required_parameters": [ - {"name": "bbox", "type": "List[float]"}, - {"name": "image", "type": "str"}, - ], - "examples": [ - { - "scenario": "Can you crop the image to the bounding box [0.1, 0.1, 0.9, 0.9]? Image name: image.jpg", - "parameters": {"bbox": [0.1, 0.1, 0.9, 0.9], "image": "image.jpg"}, - }, - { - "scenario": "Cut out the image to the bounding box [0.2, 0.2, 0.8, 0.8]. 
Image name: car.jpg", - "parameters": {"bbox": [0.2, 0.2, 0.8, 0.8], "image": "car.jpg"}, - }, - ], - } - def __call__(self, bbox: List[float], image: Union[str, Path]) -> Dict: - pil_image = Image.open(image) - width, height = pil_image.size - bbox = [ - int(bbox[0] * width), - int(bbox[1] * height), - int(bbox[2] * width), - int(bbox[3] * height), - ] - cropped_image = pil_image.crop(bbox) # type: ignore - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - cropped_image.save(tmp.name) +# Utility and visualization functions - return {"image": tmp.name} +def save_json(data: Any, file_path: str) -> None: + """'save_json' is a utility function that saves data as a JSON file. It is helpful + for saving data that contains NumPy arrays which are not JSON serializable. -class BboxStats(Tool): - r"""BboxStats returns the height, width and area of the bounding box in pixels to 2 decimal places.""" + Parameters: + data (Any): The data to save. + file_path (str): The path to save the JSON file. - name = "bbox_stats_" - description = "'bbox_stats_' returns the height, width and area of the given bounding box in pixels to 2 decimal places." - usage = { - "required_parameters": [ - {"name": "bboxes", "type": "List[int]"}, - {"name": "image_size", "type": "Tuple[int]"}, - ], - "examples": [ - { - "scenario": "Calculate the width and height of the bounding box [0.2, 0.21, 0.34, 0.42]", - "parameters": { - "bboxes": [[0.2, 0.21, 0.34, 0.42]], - "image_size": (500, 1200), - }, - }, - { - "scenario": "Calculate the area of the bounding box [0.2, 0.21, 0.34, 0.42]", - "parameters": { - "bboxes": [[0.2, 0.21, 0.34, 0.42]], - "image_size": (640, 480), - }, - }, - ], - } + Example + ------- + >>> save_json(data, "path/to/file.json") + """ - def __call__( - self, bboxes: List[List[int]], image_size: Tuple[int, int] - ) -> List[Dict]: - areas = [] - height, width = image_size - for bbox in bboxes: - x1, y1, x2, y2 = bbox - areas.append( - { - "width": round((x2 - x1) * width, 2), - "height": round((y2 - y1) * height, 2), - "area": round((x2 - x1) * (y2 - y1) * width * height, 2), - } - ) - - return areas - - -class SegArea(Tool): - r"""SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places.""" - - name = "seg_area_" - description = "'seg_area_' returns the area of the given segmentation mask in pixels normalized to 2 decimal places." - usage = { - "required_parameters": [{"name": "masks", "type": "str"}], - "examples": [ - { - "scenario": "If you want to calculate the area of the segmentation mask, pass the masks file name.", - "parameters": {"masks": "mask_file.jpg"}, - }, - ], - } + class NumpyEncoder(json.JSONEncoder): + def default(self, obj: Any): # type: ignore + if isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.bool_): + return bool(obj) + return json.JSONEncoder.default(self, obj) - def __call__(self, masks: Union[str, Path]) -> float: - pil_mask = Image.open(str(masks)) - np_mask = np.array(pil_mask) - np_mask = np.clip(np_mask, 0, 1) - return cast(float, round(np.sum(np_mask), 2)) - - -class BboxIoU(Tool): - name = "bbox_iou_" - description = "'bbox_iou_' returns the intersection over union of two bounding boxes. This is a good tool for determining if two objects are overlapping." 
- usage = { - "required_parameters": [ - {"name": "bbox1", "type": "List[int]"}, - {"name": "bbox2", "type": "List[int]"}, - ], - "examples": [ - { - "scenario": "If you want to calculate the intersection over union of the bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]", - "parameters": { - "bbox1": [0.2, 0.21, 0.34, 0.42], - "bbox2": [0.3, 0.31, 0.44, 0.52], - }, - } - ], - } + with open(file_path, "w") as f: + json.dump(data, f, cls=NumpyEncoder) - def __call__(self, bbox1: List[int], bbox2: List[int]) -> float: - x1, y1, x2, y2 = bbox1 - x3, y3, x4, y4 = bbox2 - xA = max(x1, x3) - yA = max(y1, y3) - xB = min(x2, x4) - yB = min(y2, y4) - inter_area = max(0, xB - xA) * max(0, yB - yA) - boxa_area = (x2 - x1) * (y2 - y1) - boxb_area = (x4 - x3) * (y4 - y3) - iou = inter_area / float(boxa_area + boxb_area - inter_area) - return round(iou, 2) - - -class SegIoU(Tool): - name = "seg_iou_" - description = "'seg_iou_' returns the intersection over union of two segmentation masks given their segmentation mask files." - usage = { - "required_parameters": [ - {"name": "mask1", "type": "str"}, - {"name": "mask2", "type": "str"}, - ], - "examples": [ - { - "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg", - "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"}, - } - ], - } - def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float: - pil_mask1 = Image.open(str(mask1)) - pil_mask2 = Image.open(str(mask2)) - np_mask1 = np.clip(np.array(pil_mask1), 0, 1) - np_mask2 = np.clip(np.array(pil_mask2), 0, 1) - intersection = np.logical_and(np_mask1, np_mask2) - union = np.logical_or(np_mask1, np_mask2) - iou = np.sum(intersection) / np.sum(union) - return cast(float, round(iou, 2)) - - -class BboxContains(Tool): - name = "bbox_contains_" - description = "Given two bounding boxes, a target bounding box and a region bounding box, 'bbox_contains_' returns the intersection of the two bounding boxes which is the percentage area of the target bounding box overlaps with the region bounding box. This is a good tool for determining if the region object contains the target object." - usage = { - "required_parameters": [ - {"name": "target", "type": "List[int]"}, - {"name": "target_class", "type": "str"}, - {"name": "region", "type": "List[int]"}, - {"name": "region_class", "type": "str"}, - ], - "examples": [ - { - "scenario": "Determine if the dog on the couch, bounding box of the dog: [0.2, 0.21, 0.34, 0.42], bounding box of the couch: [0.3, 0.31, 0.44, 0.52]", - "parameters": { - "target": [0.2, 0.21, 0.34, 0.42], - "target_class": "dog", - "region": [0.3, 0.31, 0.44, 0.52], - "region_class": "couch", - }, - }, - { - "scenario": "Check if the kid is in the pool? bounding box of the kid: [0.2, 0.21, 0.34, 0.42], bounding box of the pool: [0.3, 0.31, 0.44, 0.52]", - "parameters": { - "target": [0.2, 0.21, 0.34, 0.42], - "target_class": "kid", - "region": [0.3, 0.31, 0.44, 0.52], - "region_class": "pool", - }, - }, - ], - } +def load_image(image_path: str) -> np.ndarray: + """'load_image' is a utility function that loads an image from the given path. 
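`save_json` exists mainly because detection results contain NumPy arrays that the standard `json` module rejects; a brief sketch (paths are placeholders):

```python
from vision_agent.tools import load_image, grounding_sam, save_json

image = load_image("jar.jpg")  # placeholder image
segments = grounding_sam("coffee beans", image)

# The custom encoder inside save_json converts the NumPy masks to nested lists.
save_json(segments, "segments.json")
```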
- def __call__( - self, target: List[int], target_class: str, region: List[int], region_class: str - ) -> Dict[str, Union[str, float]]: - x1, y1, x2, y2 = target - x3, y3, x4, y4 = region - xA = max(x1, x3) - yA = max(y1, y3) - xB = min(x2, x4) - yB = min(y2, y4) - inter_area = max(0, xB - xA) * max(0, yB - yA) - boxa_area = (x2 - x1) * (y2 - y1) - iou = inter_area / float(boxa_area) - area = round(iou, 2) - return { - "target_class": target_class, - "region_class": region_class, - "intersection": area, - } - - -class ObjectDistance(Tool): - name = "object_distance_" - description = "'object_distance_' calculates the distance between two objects in an image. It returns the minimum distance between the two objects." - usage = { - "required_parameters": [ - {"name": "object1", "type": "Dict[str, Any]"}, - {"name": "object2", "type": "Dict[str, Any]"}, - ], - "examples": [ - { - "scenario": "Calculate the distance between these two objects {bboxes: [0.2, 0.21, 0.34, 0.42], masks: 'mask_file1.png'}, {bboxes: [0.3, 0.31, 0.44, 0.52], masks: 'mask_file2.png'}", - "parameters": { - "object1": { - "bboxes": [0.2, 0.21, 0.34, 0.42], - "scores": 0.54, - "masks": "mask_file1.png", - }, - "object2": { - "bboxes": [0.3, 0.31, 0.44, 0.52], - "scores": 0.66, - "masks": "mask_file2.png", - }, - }, - } - ], - } + Parameters: + image_path (str): The path to the image. - def __call__(self, object1: Dict[str, Any], object2: Dict[str, Any]) -> float: - if "masks" in object1 and "masks" in object2: - mask1 = object1["masks"] - mask2 = object2["masks"] - return MaskDistance()(mask1, mask2) - elif "bboxes" in object1 and "bboxes" in object2: - bbox1 = object1["bboxes"] - bbox2 = object2["bboxes"] - return BoxDistance()(bbox1, bbox2) - else: - raise ValueError("Either of the objects should have masks or bboxes") - - -class BoxDistance(Tool): - name = "box_distance_" - description = "'box_distance_' calculates distance between two bounding boxes. It returns the minumum distance between the given bounding boxes" - usage = { - "required_parameters": [ - {"name": "bbox1", "type": "List[int]"}, - {"name": "bbox2", "type": "List[int]"}, - ], - "examples": [ - { - "scenario": "Calculate the distance between these two bounding boxes [0.2, 0.21, 0.34, 0.42] and [0.3, 0.31, 0.44, 0.52]", - "parameters": { - "bbox1": [0.2, 0.21, 0.34, 0.42], - "bbox2": [0.3, 0.31, 0.44, 0.52], - }, - } - ], - } + Returns: + np.ndarray: The image as a NumPy array. - def __call__(self, bbox1: List[int], bbox2: List[int]) -> float: - x11, y11, x12, y12 = bbox1 - x21, y21, x22, y22 = bbox2 + Example + ------- + >>> load_image("path/to/image.jpg") + """ - horizontal_dist = np.max([0, x21 - x12, x11 - x22]) - vertical_dist = np.max([0, y21 - y12, y11 - y22]) + image = Image.open(image_path).convert("RGB") + return np.array(image) - return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2)) +def save_image(image: np.ndarray) -> str: + """'save_image' is a utility function that saves an image as a temporary file. -class MaskDistance(Tool): - name = "mask_distance_" - description = "'mask_distance_' calculates distance between two masks. It is helpful in checking proximity of two objects. 
It returns the minumum distance between the given masks" - usage = { - "required_parameters": [ - {"name": "mask1", "type": "str"}, - {"name": "mask2", "type": "str"}, - ], - "examples": [ - { - "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg", - "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"}, - } - ], - } + Parameters: + image (np.ndarray): The image to save. - def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float: - pil_mask1 = Image.open(str(mask1)) - pil_mask2 = Image.open(str(mask2)) - np_mask1 = np.clip(np.array(pil_mask1), 0, 1) - np_mask2 = np.clip(np.array(pil_mask2), 0, 1) + Returns: + str: The path to the saved image. - mask1_points = np.transpose(np.nonzero(np_mask1)) - mask2_points = np.transpose(np.nonzero(np_mask2)) - dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean") - return cast(float, np.round(np.min(dist_matrix), 2)) + Example + ------- + >>> save_image(image) + "/tmp/tmpabc123.png" + """ + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: + pil_image = Image.fromarray(image.astype(np.uint8)) + pil_image.save(f, "PNG") + return f.name -class ExtractFrames(Tool): - r"""Extract frames from a video.""" - name = "extract_frames_" - description = "'extract_frames_' extracts frames from a video every 2 seconds, returns a list of tuples (frame, timestamp), where timestamp is the relative time in seconds where the frame was captured. The frame is a local image file path." - usage = { - "required_parameters": [{"name": "video_uri", "type": "str"}], - "optional_parameters": [{"name": "frames_every", "type": "float"}], - "examples": [ - { - "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4", - "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"}, - }, - { - "scenario": "Can you extract the images from this video file at every 2 seconds ? Video path: tests/data/test.mp4", - "parameters": {"video_uri": "tests/data/test.mp4", "frames_every": 2}, - }, - ], - } +def overlay_bounding_boxes( + image: np.ndarray, bboxes: List[Dict[str, Any]] +) -> np.ndarray: + """'display_bounding_boxes' is a utility function that displays bounding boxes on + an image. - def __call__( - self, video_uri: str, frames_every: float = 2 - ) -> List[Tuple[str, float]]: - """Extract frames from a video. + Parameters: + image (np.ndarray): The image to display the bounding boxes on. + bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding + boxes. + Returns: + np.ndarray: The image with the bounding boxes, labels and scores displayed. - Parameters: - video_uri: the path to the video file or a url points to the video data + Example + ------- + >>> image_with_bboxes = display_bounding_boxes( + image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}], + ) + """ + pil_image = Image.fromarray(image.astype(np.uint8)) - Returns: - a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. - """ - frames = extract_frames_from_video(video_uri, fps=round(1 / frames_every, 2)) - result = [] - _LOGGER.info( - f"Extracted {len(frames)} frames from video {video_uri}. 
Temporarily saving them as images to disk for downstream tasks." + if len(set([box["label"] for box in bboxes])) > len(COLORS): + _LOGGER.warning( + "Number of unique labels exceeds the number of available colors. Some labels may have the same color." ) - for frame, ts in frames: - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - file_name = Path(tmp.name).with_suffix(".frame.png") - Image.fromarray(frame).save(file_name) - result.append((str(file_name), ts)) - return result - - -class OCR(Tool): - name = "ocr_" - description = "'ocr_' extracts text from an image. It returns a list of detected text, bounding boxes, and confidence scores." - usage = { - "required_parameters": [ - {"name": "image", "type": "str"}, - ], - "examples": [ - { - "scenario": "Can you extract the text from this image? Image name: image.png", - "parameters": {"image": "image.png"}, - }, - ], + + color = { + label: COLORS[i % len(COLORS)] + for i, label in enumerate(set([box["label"] for box in bboxes])) } - _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV" - _URL = "https://app.landing.ai/ocr/v1/detect-text" - - def __call__(self, image: str) -> dict: - pil_image = Image.open(image).convert("RGB") - image_size = pil_image.size[::-1] - image_buffer = io.BytesIO() - pil_image.save(image_buffer, format="PNG") - buffer_bytes = image_buffer.getvalue() - image_buffer.close() - - res = requests.post( - self._URL, - files={"images": buffer_bytes}, - data={"language": "en"}, - headers={"contentType": "multipart/form-data", "apikey": self._API_KEY}, - ) - if res.status_code != 200: - _LOGGER.error(f"Request failed: {res.text}") - raise ValueError(f"Request failed: {res.text}") - - data = res.json() - output: Dict[str, List] = {"labels": [], "bboxes": [], "scores": []} - for det in data[0]: - output["labels"].append(det["text"]) - box = [ - det["location"][0]["x"], - det["location"][0]["y"], - det["location"][2]["x"], - det["location"][2]["y"], - ] - box = normalize_bbox(box, image_size) - output["bboxes"].append(box) - output["scores"].append(round(det["score"], 2)) - return output - - -class Calculator(Tool): - r"""Calculator is a tool that can perform basic arithmetic operations.""" - - name = "calculator_" - description = ( - "'calculator_' is a tool that can perform basic arithmetic operations." 
+ + width, height = pil_image.size + fontsize = max(12, int(min(width, height) / 40)) + draw = ImageDraw.Draw(pil_image) + font = ImageFont.truetype( + str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")), + fontsize, ) - usage = { - "required_parameters": [{"name": "equation", "type": "str"}], - "examples": [ - { - "scenario": "If you want to calculate (2 * 3) + 4", - "parameters": {"equation": "2 + 4"}, - }, - { - "scenario": "If you want to calculate (4 + 2.5) / 2.1", - "parameters": {"equation": "(4 + 2.5) / 2.1"}, - }, - ], - } - def __call__(self, equation: str) -> float: - return cast(float, round(eval(equation), 2)) - - -TOOLS = { - i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c} - for i, c in enumerate( - [ - NoOp, - CLIP, - GroundingDINO, - AgentGroundingSAM, - ZeroShotCounting, - VisualPromptCounting, - VisualQuestionAnswering, - AgentDINOv, - ExtractFrames, - Crop, - BboxStats, - SegArea, - ObjectDistance, - BboxContains, - SegIoU, - OCR, - Calculator, + for elt in bboxes: + label = elt["label"] + box = elt["bbox"] + scores = elt["score"] + + box = [ + int(box[0] * width), + int(box[1] * height), + int(box[2] * width), + int(box[3] * height), ] - ) - if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage")) -} + draw.rectangle(box, outline=color[label], width=4) + text = f"{label}: {scores:.2f}" + text_box = draw.textbbox((box[0], box[1]), text=text, font=font) + draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label]) + draw.text((box[0], box[1]), text, fill="black", font=font) + return np.array(pil_image.convert("RGB")) -def register_tool(tool: Type[Tool]) -> Type[Tool]: - r"""Add a tool to the list of available tools. +def overlay_segmentation_masks( + image: np.ndarray, masks: List[Dict[str, Any]] +) -> np.ndarray: + """'display_segmentation_masks' is a utility function that displays segmentation + masks. Parameters: - tool: The tool to add. + image (np.ndarray): The image to display the masks on. + masks (List[Dict[str, Any]]): A list of dictionaries containing the masks. + + Returns: + np.ndarray: The image with the masks displayed. + + Example + ------- + >>> image_with_masks = display_segmentation_masks( + image, + [{ + 'score': 0.99, + 'label': 'dinosaur', + 'mask': array([[0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0], + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), + }], + ) """ + pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA") - if ( - not hasattr(tool, "name") - or not hasattr(tool, "description") - or not hasattr(tool, "usage") - ): - raise ValueError( - "The tool must have 'name', 'description' and 'usage' attributes." + if len(set([mask["label"] for mask in masks])) > len(COLORS): + _LOGGER.warning( + "Number of unique labels exceeds the number of available colors. Some labels may have the same color." 
) - TOOLS[len(TOOLS)] = { - "name": tool.name, - "description": tool.description, - "usage": tool.usage, - "class": tool, + color = { + label: COLORS[i % len(COLORS)] + for i, label in enumerate(set([mask["label"] for mask in masks])) } - return tool + + for elt in masks: + mask = elt["mask"] + label = elt["label"] + np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4)) + np_mask[mask > 0, :] = color[label] + (255 * 0.5,) + mask_img = Image.fromarray(np_mask.astype(np.uint8)) + pil_image = Image.alpha_composite(pil_image, mask_img) + return np.array(pil_image.convert("RGB")) + + +def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str: + docstrings = "" + for func in funcs: + docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n" + + return docstrings + + +def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str: + descriptions = "" + for func in funcs: + description = func.__doc__ + if description is None: + description = "" + + description = ( + description[: description.find("Parameters:")].replace("\n", " ").strip() + ) + description = " ".join(description.split()) + descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n" + return descriptions + + +def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame: + data: Dict[str, List[str]] = {"desc": [], "doc": []} + + for func in funcs: + desc = func.__doc__ + if desc is None: + desc = "" + desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip() + desc = " ".join(desc.split()) + + doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}" + data["desc"].append(desc) + data["doc"].append(doc) + + return pd.DataFrame(data) # type: ignore + + +TOOLS = [ + grounding_dino, + grounding_sam, + extract_frames, + ocr, + clip, + zero_shot_counting, + visual_prompt_counting, + image_question_answering, + image_caption, + closest_mask_distance, + closest_box_distance, + save_json, + load_image, + save_image, + overlay_bounding_boxes, + overlay_segmentation_masks, +] +TOOLS_DF = get_tools_df(TOOLS) # type: ignore +TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore +TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore +UTILITIES_DOCSTRING = get_tool_documentation( + [save_json, load_image, save_image, overlay_bounding_boxes] +) diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py deleted file mode 100644 index 8e202856..00000000 --- a/vision_agent/tools/tools_v2.py +++ /dev/null @@ -1,685 +0,0 @@ -import inspect -import io -import json -import logging -import tempfile -from importlib import resources -from pathlib import Path -from typing import Any, Callable, Dict, List, Tuple, Union, cast - -import numpy as np -import pandas as pd -import requests -from PIL import Image, ImageDraw, ImageFont -from scipy.spatial import distance # type: ignore - -from vision_agent.tools.tool_utils import _send_inference_request -from vision_agent.utils import extract_frames_from_video -from vision_agent.utils.image_utils import ( - b64_to_pil, - convert_to_b64, - denormalize_bbox, - get_image_size, - normalize_bbox, - rle_decode, -) - -COLORS = [ - (158, 218, 229), - (219, 219, 141), - (23, 190, 207), - (188, 189, 34), - (199, 199, 199), - (247, 182, 210), - (127, 127, 127), - (227, 119, 194), - (196, 156, 148), - (197, 176, 213), - (140, 86, 75), - (148, 103, 189), - (255, 152, 150), - (152, 223, 138), - (214, 39, 40), - (44, 160, 44), - (255, 187, 120), - (174, 199, 232), - (255, 127, 14), - (31, 119, 180), -] -_API_KEY = 
"land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV" -_OCR_URL = "https://app.landing.ai/ocr/v1/detect-text" -logging.basicConfig(level=logging.INFO) -_LOGGER = logging.getLogger(__name__) - - -def grounding_dino( - prompt: str, - image: np.ndarray, - box_threshold: float = 0.20, - iou_threshold: float = 0.20, -) -> List[Dict[str, Any]]: - """'grounding_dino' is a tool that can detect and count objects given a text prompt - such as category names or referring expressions. It returns a list and count of - bounding boxes, label names and associated probability scores. - - Parameters: - prompt (str): The prompt to ground to the image. - image (np.ndarray): The image to ground the prompt to. - box_threshold (float, optional): The threshold for the box detection. Defaults - to 0.20. - iou_threshold (float, optional): The threshold for the Intersection over Union - (IoU). Defaults to 0.20. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing the score, label, and - bounding box of the detected objects with normalized coordinates - (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and - xmax and ymax are the coordinates of the bottom-right of the bounding box. - - Example - ------- - >>> grounding_dino("car. dinosaur", image) - [ - {'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}, - {'score': 0.98, 'label': 'car', 'bbox': [0.2, 0.21, 0.45, 0.5}, - ] - """ - image_size = image.shape[:2] - image_b64 = convert_to_b64(image) - request_data = { - "prompt": prompt, - "image": image_b64, - "tool": "visual_grounding", - "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, - } - data: Dict[str, Any] = _send_inference_request(request_data, "tools") - return_data = [] - for i in range(len(data["bboxes"])): - return_data.append( - { - "score": round(data["scores"][i], 2), - "label": data["labels"][i], - "bbox": normalize_bbox(data["bboxes"][i], image_size), - } - ) - return return_data - - -def grounding_sam( - prompt: str, - image: np.ndarray, - box_threshold: float = 0.20, - iou_threshold: float = 0.20, -) -> List[Dict[str, Any]]: - """'grounding_sam' is a tool that can detect and segment objects given a text - prompt such as category names or referring expressions. It returns a list of - bounding boxes, label names and masks file names and associated probability scores. - - Parameters: - prompt (str): The prompt to ground to the image. - image (np.ndarray): The image to ground the prompt to. - box_threshold (float, optional): The threshold for the box detection. Defaults - to 0.20. - iou_threshold (float, optional): The threshold for the Intersection over Union - (IoU). Defaults to 0.20. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing the score, label, - bounding box, and mask of the detected objects with normalized coordinates - (xmin, ymin, xmax, ymax). xmin and ymin are the coordinates of the top-left and - xmax and ymax are the coordinates of the bottom-right of the bounding box. - The mask is binary 2D numpy array where 1 indicates the object and 0 indicates - the background. - - Example - ------- - >>> grounding_sam("car. 
dinosaur", image) - [ - { - 'score': 0.99, - 'label': 'dinosaur', - 'bbox': [0.1, 0.11, 0.35, 0.4], - 'mask': array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), - }, - ] - """ - image_size = image.shape[:2] - image_b64 = convert_to_b64(image) - request_data = { - "prompt": prompt, - "image": image_b64, - "tool": "visual_grounding_segment", - "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, - } - data: Dict[str, Any] = _send_inference_request(request_data, "tools") - return_data = [] - for i in range(len(data["bboxes"])): - return_data.append( - { - "score": round(data["scores"][i], 2), - "label": data["labels"][i], - "bbox": normalize_bbox(data["bboxes"][i], image_size), - "mask": rle_decode(mask_rle=data["masks"][i], shape=data["mask_shape"]), - } - ) - return return_data - - -def extract_frames( - video_uri: Union[str, Path], fps: float = 0.5 -) -> List[Tuple[np.ndarray, float]]: - """'extract_frames' extracts frames from a video, returns a list of tuples (frame, - timestamp), where timestamp is the relative time in seconds where the frame was - captured. The frame is a local image file path. - - Parameters: - video_uri (Union[str, Path]): The path to the video file. - fps (float, optional): The frame rate per second to extract the frames. Defaults - to 0.5. - - Returns: - List[Tuple[np.ndarray, float]]: A list of tuples containing the extracted frame - and the timestamp in seconds. - - Example - ------- - >>> extract_frames("path/to/video.mp4") - [(frame1, 0.0), (frame2, 0.5), ...] - """ - - return extract_frames_from_video(str(video_uri), fps) - - -def ocr(image: np.ndarray) -> List[Dict[str, Any]]: - """'ocr' extracts text from an image. It returns a list of detected text, bounding - boxes, and confidence scores. - - Parameters: - image (np.ndarray): The image to extract text from. - - Returns: - List[Dict[str, Any]]: A list of dictionaries containing the detected text, bbox, - and confidence score. - - Example - ------- - >>> ocr(image) - [ - {'label': 'some text', 'bbox': [0.1, 0.11, 0.35, 0.4], 'score': 0.99}, - ] - """ - - pil_image = Image.fromarray(image).convert("RGB") - image_size = pil_image.size[::-1] - image_buffer = io.BytesIO() - pil_image.save(image_buffer, format="PNG") - buffer_bytes = image_buffer.getvalue() - image_buffer.close() - - res = requests.post( - _OCR_URL, - files={"images": buffer_bytes}, - data={"language": "en"}, - headers={"contentType": "multipart/form-data", "apikey": _API_KEY}, - ) - - if res.status_code != 200: - raise ValueError(f"OCR request failed with status code {res.status_code}") - - data = res.json() - output = [] - for det in data[0]: - label = det["text"] - box = [ - det["location"][0]["x"], - det["location"][0]["y"], - det["location"][2]["x"], - det["location"][2]["y"], - ] - box = normalize_bbox(box, image_size) - output.append({"label": label, "bbox": box, "score": round(det["score"], 2)}) - - return output - - -def zero_shot_counting(image: np.ndarray) -> Dict[str, Any]: - """'zero_shot_counting' is a tool that counts the dominant foreground object given an image and no other information about the content. - It returns only the count of the objects in the image. - - Parameters: - image (np.ndarray): The image that contains lot of instances of a single object - - Returns: - Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}. 
- - Example - ------- - >>> zero_shot_counting(image) - {'count': 45}, - - """ - - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "tool": "zero_shot_counting", - } - resp_data = _send_inference_request(data, "tools") - resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) - return resp_data - - -def visual_prompt_counting( - image: np.ndarray, visual_prompt: Dict[str, List[float]] -) -> Dict[str, Any]: - """'visual_prompt_counting' is a tool that counts the dominant foreground object given an image and a visual prompt which is a bounding box describing the object. - It returns only the count of the objects in the image. - - Parameters: - image (np.ndarray): The image that contains lot of instances of a single object - - Returns: - Dict[str, Any]: A dictionary containing the key 'count' and the count as a value. E.g. {count: 12}. - - Example - ------- - >>> visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]}) - {'count': 45}, - - """ - - image_size = get_image_size(image) - bbox = visual_prompt["bbox"] - bbox_str = ", ".join(map(str, denormalize_bbox(bbox, image_size))) - image_b64 = convert_to_b64(image) - - data = { - "image": image_b64, - "prompt": bbox_str, - "tool": "few_shot_counting", - } - resp_data = _send_inference_request(data, "tools") - resp_data["heat_map"] = np.array(b64_to_pil(resp_data["heat_map"][0])) - return resp_data - - -def image_question_answering(image: np.ndarray, prompt: str) -> str: - """'image_question_answering_' is a tool that can answer questions about the visual contents of an image given a question and an image. - It returns an answer to the question - - Parameters: - image (np.ndarray): The reference image used for the question - prompt (str): The question about the image - - Returns: - str: A string which is the answer to the given prompt. E.g. {'text': 'This image contains a cat sitting on a table with a bowl of milk.'}. - - Example - ------- - >>> image_question_answering(image, 'What is the cat doing ?') - 'drinking milk' - - """ - - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "prompt": prompt, - "tool": "image_question_answering", - } - - answer = _send_inference_request(data, "tools") - return answer["text"][0] # type: ignore - - -def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]: - """'clip' is a tool that can classify an image given a list of input classes or tags. - It returns the same list of the input classes along with their probability scores based on image content. - - Parameters: - image (np.ndarray): The image to classify or tag - classes (List[str]): The list of classes or tags that is associated with the image - - Returns: - Dict[str, Any]: A dictionary containing the labels and scores. One dictionary contains a list of given labels and other a list of scores. - - Example - ------- - >>> clip(image, ['dog', 'cat', 'bird']) - {"labels": ["dog", "cat", "bird"], "scores": [0.68, 0.30, 0.02]}, - - """ - - image_b64 = convert_to_b64(image) - data = { - "prompt": ",".join(classes), - "image": image_b64, - "tool": "closed_set_image_classification", - } - resp_data = _send_inference_request(data, "tools") - resp_data["scores"] = [round(prob, 4) for prob in resp_data["scores"]] - return resp_data - - -def image_caption(image: np.ndarray) -> str: - """'image_caption' is a tool that can caption an image based on its contents. - It returns a text describing the image. 
- - Parameters: - image (np.ndarray): The image to caption - - Returns: - str: A string which is the caption for the given image. - - Example - ------- - >>> image_caption(image) - 'This image contains a cat sitting on a table with a bowl of milk.' - - """ - - image_b64 = convert_to_b64(image) - data = { - "image": image_b64, - "tool": "image_captioning", - } - - answer = _send_inference_request(data, "tools") - return answer["text"][0] # type: ignore - - -def closest_mask_distance(mask1: np.ndarray, mask2: np.ndarray) -> float: - """'closest_mask_distance' calculates the closest distance between two masks. - - Parameters: - mask1 (np.ndarray): The first mask. - mask2 (np.ndarray): The second mask. - - Returns: - float: The closest distance between the two masks. - - Example - ------- - >>> closest_mask_distance(mask1, mask2) - 0.5 - """ - - mask1 = np.clip(mask1, 0, 1) - mask2 = np.clip(mask2, 0, 1) - mask1_points = np.transpose(np.nonzero(mask1)) - mask2_points = np.transpose(np.nonzero(mask2)) - dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean") - return cast(float, np.min(dist_matrix)) - - -def closest_box_distance( - box1: List[float], box2: List[float], image_size: Tuple[int, int] -) -> float: - """'closest_box_distance' calculates the closest distance between two bounding boxes. - - Parameters: - box1 (List[float]): The first bounding box. - box2 (List[float]): The second bounding box. - image_size (Tuple[int, int]): The size of the image given as (height, width). - - Returns: - float: The closest distance between the two bounding boxes. - - Example - ------- - >>> closest_box_distance([100, 100, 200, 200], [300, 300, 400, 400]) - 141.42 - """ - - x11, y11, x12, y12 = denormalize_bbox(box1, image_size) - x21, y21, x22, y22 = denormalize_bbox(box2, image_size) - - horizontal_distance = np.max([0, x21 - x12, x11 - x22]) - vertical_distance = np.max([0, y21 - y12, y11 - y22]) - return cast(float, np.sqrt(horizontal_distance**2 + vertical_distance**2)) - - -# Utility and visualization functions - - -def save_json(data: Any, file_path: str) -> None: - """'save_json' is a utility function that saves data as a JSON file. It is helpful - for saving data that contains NumPy arrays which are not JSON serializable. - - Parameters: - data (Any): The data to save. - file_path (str): The path to save the JSON file. - - Example - ------- - >>> save_json(data, "path/to/file.json") - """ - - class NumpyEncoder(json.JSONEncoder): - def default(self, obj: Any): # type: ignore - if isinstance(obj, np.ndarray): - return obj.tolist() - elif isinstance(obj, np.bool_): - return bool(obj) - return json.JSONEncoder.default(self, obj) - - with open(file_path, "w") as f: - json.dump(data, f, cls=NumpyEncoder) - - -def load_image(image_path: str) -> np.ndarray: - """'load_image' is a utility function that loads an image from the given path. - - Parameters: - image_path (str): The path to the image. - - Returns: - np.ndarray: The image as a NumPy array. - - Example - ------- - >>> load_image("path/to/image.jpg") - """ - - image = Image.open(image_path).convert("RGB") - return np.array(image) - - -def save_image(image: np.ndarray) -> str: - """'save_image' is a utility function that saves an image as a temporary file. - - Parameters: - image (np.ndarray): The image to save. - - Returns: - str: The path to the saved image. 
- - Example - ------- - >>> save_image(image) - "/tmp/tmpabc123.png" - """ - - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: - pil_image = Image.fromarray(image.astype(np.uint8)) - pil_image.save(f, "PNG") - return f.name - - -def overlay_bounding_boxes( - image: np.ndarray, bboxes: List[Dict[str, Any]] -) -> np.ndarray: - """'display_bounding_boxes' is a utility function that displays bounding boxes on - an image. - - Parameters: - image (np.ndarray): The image to display the bounding boxes on. - bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding - boxes. - - Returns: - np.ndarray: The image with the bounding boxes, labels and scores displayed. - - Example - ------- - >>> image_with_bboxes = display_bounding_boxes( - image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}], - ) - """ - pil_image = Image.fromarray(image.astype(np.uint8)) - - if len(set([box["label"] for box in bboxes])) > len(COLORS): - _LOGGER.warning( - "Number of unique labels exceeds the number of available colors. Some labels may have the same color." - ) - - color = { - label: COLORS[i % len(COLORS)] - for i, label in enumerate(set([box["label"] for box in bboxes])) - } - - width, height = pil_image.size - fontsize = max(12, int(min(width, height) / 40)) - draw = ImageDraw.Draw(pil_image) - font = ImageFont.truetype( - str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")), - fontsize, - ) - - for elt in bboxes: - label = elt["label"] - box = elt["bbox"] - scores = elt["score"] - - box = [ - int(box[0] * width), - int(box[1] * height), - int(box[2] * width), - int(box[3] * height), - ] - draw.rectangle(box, outline=color[label], width=4) - text = f"{label}: {scores:.2f}" - text_box = draw.textbbox((box[0], box[1]), text=text, font=font) - draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label]) - draw.text((box[0], box[1]), text, fill="black", font=font) - return np.array(pil_image.convert("RGB")) - - -def overlay_segmentation_masks( - image: np.ndarray, masks: List[Dict[str, Any]] -) -> np.ndarray: - """'display_segmentation_masks' is a utility function that displays segmentation - masks. - - Parameters: - image (np.ndarray): The image to display the masks on. - masks (List[Dict[str, Any]]): A list of dictionaries containing the masks. - - Returns: - np.ndarray: The image with the masks displayed. - - Example - ------- - >>> image_with_masks = display_segmentation_masks( - image, - [{ - 'score': 0.99, - 'label': 'dinosaur', - 'mask': array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), - }], - ) - """ - pil_image = Image.fromarray(image.astype(np.uint8)).convert("RGBA") - - if len(set([mask["label"] for mask in masks])) > len(COLORS): - _LOGGER.warning( - "Number of unique labels exceeds the number of available colors. Some labels may have the same color." 
- ) - - color = { - label: COLORS[i % len(COLORS)] - for i, label in enumerate(set([mask["label"] for mask in masks])) - } - - for elt in masks: - mask = elt["mask"] - label = elt["label"] - np_mask = np.zeros((pil_image.size[1], pil_image.size[0], 4)) - np_mask[mask > 0, :] = color[label] + (255 * 0.5,) - mask_img = Image.fromarray(np_mask.astype(np.uint8)) - pil_image = Image.alpha_composite(pil_image, mask_img) - return np.array(pil_image.convert("RGB")) - - -def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str: - docstrings = "" - for func in funcs: - docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n" - - return docstrings - - -def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str: - descriptions = "" - for func in funcs: - description = func.__doc__ - if description is None: - description = "" - - description = ( - description[: description.find("Parameters:")].replace("\n", " ").strip() - ) - description = " ".join(description.split()) - descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n" - return descriptions - - -def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame: - data: Dict[str, List[str]] = {"desc": [], "doc": []} - - for func in funcs: - desc = func.__doc__ - if desc is None: - desc = "" - desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip() - desc = " ".join(desc.split()) - - doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}" - data["desc"].append(desc) - data["doc"].append(doc) - - return pd.DataFrame(data) # type: ignore - - -TOOLS = [ - grounding_dino, - grounding_sam, - extract_frames, - ocr, - clip, - zero_shot_counting, - visual_prompt_counting, - image_question_answering, - image_caption, - closest_mask_distance, - closest_box_distance, - save_json, - load_image, - save_image, - overlay_bounding_boxes, - overlay_segmentation_masks, -] -TOOLS_DF = get_tools_df(TOOLS) # type: ignore -TOOL_DESCRIPTIONS = get_tool_descriptions(TOOLS) # type: ignore -TOOL_DOCSTRING = get_tool_documentation(TOOLS) # type: ignore -UTILITIES_DOCSTRING = get_tool_documentation( - [save_json, load_image, save_image, overlay_bounding_boxes] -)
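
As a quick sanity check of the relocated module, here is a minimal sketch of how the detection and visualization tools above are meant to compose. It assumes `load_image`, `grounding_sam`, `overlay_segmentation_masks` and `save_image` are re-exported from `vision_agent.tools` as listed in `TOOLS`; `"jar.jpg"` is a placeholder path.

```python
# Minimal sketch: segment an object, inspect the masks, save a visualization.
# Assumes the exports listed in TOOLS above are re-exported by vision_agent.tools;
# "jar.jpg" is a placeholder path.
from vision_agent.tools import (
    load_image,
    grounding_sam,
    overlay_segmentation_masks,
    save_image,
)

image = load_image("jar.jpg")  # RGB image as an np.ndarray
masks = grounding_sam(prompt="coffee beans", image=image)

# Each detection carries a normalized 'bbox', a binary 'mask', a 'label' and a 'score'.
total_pixels = sum(m["mask"].sum() for m in masks)
print(f"{len(masks)} regions, {total_pixels} mask pixels")

# Overlay the masks and write the result to a temporary PNG.
print(save_image(overlay_segmentation_masks(image, masks)))
```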
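
The prompt-facing constants at the bottom of the new `tools.py` (`TOOLS_DF`, `TOOL_DESCRIPTIONS`, `TOOL_DOCSTRING`, `UTILITIES_DOCSTRING`) are all derived from the three helpers above. A small sketch, assuming the helpers are imported from the module path shown in the diff and using an arbitrary subset of tools:

```python
# Sketch of how the documentation helpers summarize tools for the agent prompts.
# The tool subset here is arbitrary; any callables with docstrings would do.
from vision_agent.tools.tools import (
    get_tool_descriptions,
    get_tool_documentation,
    get_tools_df,
    grounding_dino,
    ocr,
)

funcs = [grounding_dino, ocr]
print(get_tool_descriptions(funcs))   # "- name(signature): one-line summary" per tool
print(get_tool_documentation(funcs))  # full signatures plus docstrings
print(get_tools_df(funcs))            # DataFrame with 'desc' and 'doc' columns
```

This is also why every tool keeps a `Parameters:` section in its docstring: the one-line summaries are taken from the text that precedes it.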
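
The two counting tools differ only in whether a reference box is supplied. A sketch, assuming both are re-exported from `vision_agent.tools`; the image path is illustrative and the bounding box is the normalized example from the docstring above:

```python
# Sketch: zero-shot counting vs. counting guided by a visual prompt.
# The bounding box must be normalized (xmin, ymin, xmax, ymax) in [0, 1];
# "apples.jpg" is a placeholder path.
from vision_agent.tools import load_image, visual_prompt_counting, zero_shot_counting

image = load_image("apples.jpg")
print(zero_shot_counting(image)["count"])
print(visual_prompt_counting(image, {"bbox": [0.1, 0.1, 0.4, 0.42]})["count"])
```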