
Commit

Merge branch 'add_vqa_tool' of github.com:landing-ai/vision-agent into add_vqa_tool
shankar-vision-eng committed Apr 24, 2024
2 parents d72de9f + 5ef5c2f commit 3ae4877
Showing 12 changed files with 390 additions and 103 deletions.
37 changes: 33 additions & 4 deletions README.md
@@ -31,7 +31,7 @@ pip install vision-agent
```

Ensure you have an OpenAI API key and set it as an environment variable (if you are
-using Azure OpenAI please see the additional setup section):
+using Azure OpenAI please see the Azure setup section):

```bash
export OPENAI_API_KEY="your-api-key"
@@ -96,26 +96,55 @@ you. For example:
}]
```

#### Custom Tools
You can also add your own custom tools for your vision agent to use:

```python
>>> from vision_agent.tools import Tool, register_tool
>>> @register_tool
>>> class NumItems(Tool):
>>>     name = "num_items_"
>>>     description = "Returns the number of items in a list."
>>>     usage = {
>>>         "required_parameters": [{"name": "prompt", "type": "list"}],
>>>         "examples": [
>>>             {
>>>                 "scenario": "How many items are in this list? ['a', 'b', 'c']",
>>>                 "parameters": {"prompt": "['a', 'b', 'c']"},
>>>             }
>>>         ],
>>>     }
>>>     def __call__(self, prompt: list[str]) -> int:
>>>         return len(prompt)
```
This registers the tool with the list of tools Vision Agent has access to. The agent can then pick the tool based on its description and call it according to the usage examples provided.
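
As a rough sketch of how the registered tool might then be exercised (this mirrors the `chat_with_workflow` example in `examples/custom_tools/run_custom_tool.py` later in this commit, so treat the exact call signature as an assumption rather than the canonical API):

```python
import vision_agent as va

# Hypothetical usage sketch: the agent should select the registered
# `num_items_` tool based on its description and usage examples.
agent = va.agent.VisionAgent(verbose=True)
resp, tools = agent.chat_with_workflow(
    [{"role": "user", "content": "How many items are in this list? ['a', 'b', 'c']"}]
)
print(resp)
```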

#### Tool List
| Tool | Description |
| --- | --- |
| CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
| ImageCaption | ImageCaption is a tool that can generate a caption for an image. |
| GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
| GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
| Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
| DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
| Crop | Crop crops an image given a bounding box and returns the file name of the cropped image. |
| BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
| SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another. |
| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
| VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and a visual prompt. |
| OCR | OCR returns the text detected in an image along with its location. |


It also has a basic set of calculation tools such as add, subtract, multiply, and divide.

-### Additional Setup
+### Azure Setup
If you want to use Azure OpenAI models, you can set the environment variable:

```bash
20 changes: 20 additions & 0 deletions docs/lmms.md
@@ -0,0 +1,20 @@
### LMMs
One of the problems of dealing with image data is that it can be difficult to organize and
search. For example, you might have a bunch of pictures of houses and want to count how
many yellow houses you have, or how many have adobe roofs. The vision agent library
uses LMMs to help create tags or descriptions of images so you can search over them,
or use them in a database to carry out other operations.

To get started, you can use an LMM to generate text from images. The following
code uses the LLaVA-1.6 34B model to generate a description of the image you pass it.

```python
import vision_agent as va

model = va.lmm.get_lmm("llava")
model.generate("Describe this image", "image.png")
>>> "A yellow house with a green lawn."
```
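
Building on that, here is a minimal sketch of the tagging workflow described above: looping over a folder of images and keeping each description so you can search over them later. The `houses/` folder and the substring search are illustrative assumptions, not part of the library.

```python
from pathlib import Path

import vision_agent as va

model = va.lmm.get_lmm("llava")

# Build a simple {filename: description} index you could later search
# or load into a database (assumes the generate() signature shown above).
descriptions = {
    image_path.name: model.generate("Describe this image", str(image_path))
    for image_path in Path("houses/").glob("*.png")
}

# Naive keyword search over the generated descriptions.
yellow_houses = [
    name for name, desc in descriptions.items() if "yellow" in desc.lower()
]
```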

**WARNING** We are hosting the LLaVA-1.6 34B model; if it times out, please wait ~3-5
minutes for the server to warm up, as it shuts down when usage is low.
69 changes: 0 additions & 69 deletions docs/lmms_and_datastore.md

This file was deleted.

Binary file added examples/custom_tools/pid.png
Binary file added examples/custom_tools/pid_template.png
49 changes: 49 additions & 0 deletions examples/custom_tools/run_custom_tool.py
@@ -0,0 +1,49 @@
from template_match import template_matching_with_rotation

import vision_agent as va
from vision_agent.image_utils import get_image_size, normalize_bbox
from vision_agent.tools import Tool, register_tool


@register_tool
class TemplateMatch(Tool):
    name = "template_match_"
    description = "'template_match_' takes a template image and finds all locations where that template appears in the input image."
    usage = {
        "required_parameters": [
            {"name": "target_image", "type": "str"},
            {"name": "template_image", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Can you detect the location of the template in the target image? Image name: target.png Reference image: template.png",
                "parameters": {
                    "target_image": "target.png",
                    "template_image": "template.png",
                },
            },
        ],
    }

    def __call__(self, target_image: str, template_image: str) -> dict:
        # Run rotation-aware template matching and normalize the resulting
        # boxes to the size of the target image.
        image_size = get_image_size(target_image)
        matches = template_matching_with_rotation(target_image, template_image)
        matches["bboxes"] = [
            normalize_bbox(box, image_size) for box in matches["bboxes"]
        ]
        return matches


if __name__ == "__main__":
    agent = va.agent.VisionAgent(verbose=True)
    resp, tools = agent.chat_with_workflow(
        [
            {
                "role": "user",
                "content": "Can you find the locations of the pid_template.png in pid.png and tell me if any are nearby 'NOTE 5'?",
            }
        ],
        image="pid.png",
        reference_data={"image": "pid_template.png"},
        visualize_output=True,
    )
96 changes: 96 additions & 0 deletions examples/custom_tools/template_match.py
@@ -0,0 +1,96 @@
import cv2
import numpy as np
import torch
from torchvision.ops import nms


def rotate_image(mat, angle):
    """
    Rotates an image (angle in degrees) and expands the image to avoid cropping.
    """

    height, width = mat.shape[:2]  # image shape has 3 dimensions
    image_center = (
        width / 2,
        height / 2,
    )  # getRotationMatrix2D needs coordinates in reverse order (width, height) compared to shape

    rotation_mat = cv2.getRotationMatrix2D(image_center, angle, 1.0)

    # rotation calculates the cos and sin, taking absolutes of those.
    abs_cos = abs(rotation_mat[0, 0])
    abs_sin = abs(rotation_mat[0, 1])

    # find the new width and height bounds
    bound_w = int(height * abs_sin + width * abs_cos)
    bound_h = int(height * abs_cos + width * abs_sin)

    # subtract the old image center (bringing the image back to the origin) and add the new image center coordinates
    rotation_mat[0, 2] += bound_w / 2 - image_center[0]
    rotation_mat[1, 2] += bound_h / 2 - image_center[1]

    # rotate image with the new bounds and translated rotation matrix
    rotated_mat = cv2.warpAffine(mat, rotation_mat, (bound_w, bound_h))
    return rotated_mat


def template_matching_with_rotation(
    main_image_path: str,
    template_path: str,
    max_rotation: int = 360,
    step: int = 90,
    threshold: float = 0.75,
    visualize: bool = False,
) -> dict:
    main_image = cv2.imread(main_image_path)
    template = cv2.imread(template_path)
    template_height, template_width = template.shape[:2]

    # Convert images to grayscale
    main_image_gray = cv2.cvtColor(main_image, cv2.COLOR_BGR2GRAY)
    template_gray = cv2.cvtColor(template, cv2.COLOR_BGR2GRAY)

    boxes = []
    scores = []

    for angle in range(0, max_rotation, step):
        # Rotate the template
        rotated_template = rotate_image(template_gray, angle)

        # Perform template matching
        result = cv2.matchTemplate(
            main_image_gray,
            rotated_template,
            cv2.TM_CCOEFF_NORMED,
        )

        # Collect every location whose match score clears the threshold
        y_coords, x_coords = np.where(result >= threshold)
        for x, y in zip(x_coords, y_coords):
            boxes.append(
                (x, y, x + rotated_template.shape[1], y + rotated_template.shape[0])
            )
            scores.append(result[y, x])

    # Suppress overlapping detections across all rotations with NMS
    indices = (
        nms(
            torch.tensor(boxes).float(),
            torch.tensor(scores).float(),
            0.2,
        )
        .numpy()
        .tolist()
    )
    boxes = [boxes[i] for i in indices]
    scores = [scores[i] for i in indices]

    if visualize:
        # Draw a rectangle around each match
        for box in boxes:
            cv2.rectangle(main_image, (box[0], box[1]), (box[2], box[3]), 255, 2)

        # Display the result
        cv2.imshow("Best Match", main_image)
        cv2.waitKey(0)
        cv2.destroyAllWindows()

    return {"bboxes": boxes, "scores": scores}
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "vision-agent"
version = "0.2.2"
version = "0.2.3"
description = "Toolset for Vision Agent"
authors = ["Landing AI <[email protected]>"]
readme = "README.md"
70 changes: 70 additions & 0 deletions tests/tools/test_tools.py
@@ -2,8 +2,10 @@
import tempfile

import numpy as np
import pytest
from PIL import Image

from vision_agent.tools import TOOLS, Tool, register_tool
from vision_agent.tools.tools import BboxIoU, BoxDistance, SegArea, SegIoU


@@ -65,3 +67,71 @@ def test_box_distance():
    box1 = [0, 0, 2, 2]
    box2 = [1, 1, 3, 3]
    assert box_dist(box1, box2) == 0.0


def test_register_tool():
    assert TOOLS[len(TOOLS) - 1]["name"] != "test_tool_"

    @register_tool
    class TestTool(Tool):
        name = "test_tool_"
        description = "Test Tool"
        usage = {
            "required_parameters": [{"name": "prompt", "type": "str"}],
            "examples": [
                {
                    "scenario": "Test",
                    "parameters": {"prompt": "Test Prompt"},
                }
            ],
        }

        def __call__(self, prompt: str) -> str:
            return prompt

    assert TOOLS[len(TOOLS) - 1]["name"] == "test_tool_"


def test_register_tool_incorrect():
    with pytest.raises(ValueError):

        @register_tool
        class NoAttributes(Tool):
            pass

    with pytest.raises(ValueError):

        @register_tool
        class NoName(Tool):
            description = "Test Tool"
            usage = {
                "required_parameters": [{"name": "prompt", "type": "str"}],
                "examples": [
                    {
                        "scenario": "Test",
                        "parameters": {"prompt": "Test Prompt"},
                    }
                ],
            }

    with pytest.raises(ValueError):

        @register_tool
        class NoDescription(Tool):
            name = "test_tool_"
            usage = {
                "required_parameters": [{"name": "prompt", "type": "str"}],
                "examples": [
                    {
                        "scenario": "Test",
                        "parameters": {"prompt": "Test Prompt"},
                    }
                ],
            }

    with pytest.raises(ValueError):

        @register_tool
        class NoUsage(Tool):
            name = "test_tool_"
            description = "Test Tool"
