From a7484ca7929ff3842114f4789aeae1a1802e2dbf Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 26 Apr 2024 10:37:00 -0700
Subject: [PATCH 1/6] added new tool docs

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 835b8b99..1ee54443 100644
--- a/README.md
+++ b/README.md
@@ -137,8 +137,10 @@ to pick it based on the tool description and use it based on the usage provided.
 | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
 | BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is good for checking if one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
-| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
-| VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
+| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
+| VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt. |
+| VisualQuestionAnswering | VisualQuestionAnswering is a tool that can explain the contents of an image and answer questions about the image. |
+| ImageQuestionAnswering | ImageQuestionAnswering is similar to VisualQuestionAnswering but does not rely on OpenAI and instead uses a dedicated model for the task. |
 | OCR | OCR returns the text detected in an image along with the location. |

From 97d7fd6ab0951153618497ea6268ed56ae9012d2 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 26 Apr 2024 11:06:14 -0700
Subject: [PATCH 2/6] added example tutorials

---
 README.md                       | 59 +++++++++++++++++++++++----------
 examples/custom_tools/README.md | 29 ++++++++++++++++
 examples/mask_app/README.md     | 31 +++++++++++++++++
 3 files changed, 102 insertions(+), 17 deletions(-)
 create mode 100644 examples/custom_tools/README.md
 create mode 100644 examples/mask_app/README.md

diff --git a/README.md b/README.md
index 1ee54443..53a36be1 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,30 @@ the individual steps and tools to get the answer:
   {"visualize_output": "final_output.png"}]
 ```

+You can also provide reference data for the model to utilize. For example, if you want
+to use VisualPromptCounting:
+
+```python
+agent(
+    "How many apples are in this image?",
+    image="apples.jpg",
+    reference_data={"bbox": [0.1, 0.11, 0.24, 0.25]},
+)
+```
+Where `[0.1, 0.11, 0.24, 0.25]` are the normalized bounding box coordinates of an apple.
+Similarly, for DINOv you can provide a reference image and mask:
+
+```python
+agent(
+    "Can you detect all of the objects similar to the mask I've provided?",
+    image="image.jpg",
+    reference_data={"mask": "reference_mask.png", "image": "reference_image.png"},
+)
+```
+Here, `reference_mask.png` and `reference_image.png` in `reference_data` could be any
+image with its corresponding mask of the object you want to detect in `image.jpg`.
+You can find a demo app to generate masks for DINOv [here](examples/mask_app/README.md)
+
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
 while others are hosted for you. You can also ask an LLM directly to build a tool for
 you. For example:

 You can also add your own custom tools for your vision agent to use:

 ```python
->>> from vision_agent.tools import Tool, register_tool
->>> @register_tool
->>> class NumItems(Tool):
->>>     name = "num_items_"
->>>     description = "Returns the number of items in a list."
->>>     usage = {
->>>         "required_parameters": [{"name": "prompt", "type": "list"}],
->>>         "examples": [
->>>             {
->>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
->>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
->>>             }
->>>         ],
->>>     }
->>>     def __call__(self, prompt: list[str]) -> int:
->>>         return len(prompt)
+from vision_agent.tools import Tool, register_tool
+@register_tool
+class NumItems(Tool):
+    name = "num_items_"
+    description = "Returns the number of items in a list."
+    usage = {
+        "required_parameters": [{"name": "prompt", "type": "list"}],
+        "examples": [
+            {
+                "scenario": "How many items are in this list? ['a', 'b', 'c']",
+                "parameters": {"prompt": "['a', 'b', 'c']"},
+            }
+        ],
+    }
+    def __call__(self, prompt: list[str]) -> int:
+        return len(prompt)
 ```
 This will register it with the list of tools Vision Agent has access to. It will be able
-to pick it based on the tool description and use it based on the usage provided.
+to pick it based on the tool description and use it based on the usage provided. You can
+find an example that creates a custom tool for template matching [here](examples/custom_tool/README.md).

 #### Tool List
 | Tool | Description |
diff --git a/examples/custom_tools/README.md b/examples/custom_tools/README.md
new file mode 100644
index 00000000..5ad342ac
--- /dev/null
+++ b/examples/custom_tools/README.md
@@ -0,0 +1,29 @@
+# Template Matching Custom Tool
+
+This demo shows you how to create a custom tool for template matching that your Vision
+Agent can then use to help you answer questions. To get started, you can install the
+requirements by running:
+
+```bash
+pip install -r requirements.txt
+```
+
+You can then run the custom tool by running:
+
+```bash
+python run_custom_tool.py
+```
+
+Tool choice can be difficult for the agent to get right, so it sometimes helps to
+explicitly call out which tool you want to use. For example:
+
+```python
+import vision_agent as va
+
+agent = va.agent.VisionAgent(verbose=True)
+agent(
+    "Can you use the 'template_match_' tool to find the location of pid_template.png in pid.png?",
+    image="pid.png",
+    reference_data={"image": "pid_template.png"},
+)
+```
diff --git a/examples/mask_app/README.md b/examples/mask_app/README.md
new file mode 100644
index 00000000..a4dba53c
--- /dev/null
+++ b/examples/mask_app/README.md
@@ -0,0 +1,31 @@
+# Generate Masks for DINOv
+
+This application allows you to generate masks to use for the DINOv tool. To get started,
+install the requirements by running:
+
+```bash
+pip install -r requirements.txt
+```
+
+Then you can run the streamlit app by running:
+
+```bash
+streamlit run app.py
+```
+
+From here you can upload an image, paint a mask over the image, and then save the mask.
+This can be used as input for the DINOv tool.
+
+```python
+import vision_agent as va
+
+data = {
+    "prompt": [{"mask": "baggage_mask.png", "image": "baggage.png"}],
+    "image": "baggage2.png",
+}
+tool = va.tools.DINOv()
+output = tool(**data)
+image = va.image_utils.overlay_masks("baggage2.png", output)
+image = va.image_utils.overlay_bboxes(image, output)
+image.show()
+```

From c20bbfeab25801584da7b00e538c37a0e6d2fe9c Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 26 Apr 2024 11:07:40 -0700
Subject: [PATCH 3/6] fixed link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 53a36be1..10fae94f 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ class NumItems(Tool):
 ```
 This will register it with the list of tools Vision Agent has access to. It will be able
 to pick it based on the tool description and use it based on the usage provided. You can
-find an example that creates a custom tool for template matching [here](examples/custom_tool/README.md).
+find an example that creates a custom tool for template matching [here](examples/custom_tools/).

 #### Tool List
 | Tool | Description |

From e5088d58f842246504a39c3ba58b9a228f02a278 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 26 Apr 2024 11:08:06 -0700
Subject: [PATCH 4/6] fixed link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 10fae94f..286e4376 100644
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ agent(
 ```
 Here, `reference_mask.png` and `reference_image.png` in `reference_data` could be any
 image with its corresponding mask of the object you want to detect in `image.jpg`.
-You can find a demo app to generate masks for DINOv [here](examples/mask_app/README.md)
+You can find a demo app to generate masks for DINOv [here](examples/mask_app/)

 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally

From 8608092071a89984e4b1e910254e9f0bf1c498a3 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 26 Apr 2024 11:08:36 -0700
Subject: [PATCH 5/6] fixed link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 286e4376..76bcc2e1 100644
--- a/README.md
+++ b/README.md
@@ -100,7 +100,7 @@ agent(
 ```
 Here, `reference_mask.png` and `reference_image.png` in `reference_data` could be any
 image with its corresponding mask of the object you want to detect in `image.jpg`.
-You can find a demo app to generate masks for DINOv [here](examples/mask_app/)
+You can find a demo app to generate masks for DINOv [here](examples/mask_app/).

 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally

From 968cd54f15f3dafdf43211a2af29e0e84bcb829f Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Fri, 26 Apr 2024 12:39:21 -0700
Subject: [PATCH 6/6] spelling mistakes

---
 vision_agent/agent/vision_agent.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index b72e89eb..d881d6ae 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -577,7 +577,7 @@ def chat_with_workflow(
             self.task_model, question, self.tools, reflections
         )

-        task_depend = {"Original Quesiton": question}
+        task_depend = {"Original Question": question}
         previous_log = ""
         answers = []
         for task in task_list: