From 97a18de9b03525e48b66e1067ee53a17e96cb241 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 22 Apr 2024 10:09:34 -0700
Subject: [PATCH 1/6] added new tools to README

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 741b8ff2..7664a25c 100644
--- a/README.md
+++ b/README.md
@@ -99,15 +99,17 @@ you. For example:
 | Tool | Description |
 | --- | --- |
 | CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
+| ImageCaption | ImageCaption is a tool that can generate a caption for an image. |
 | GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
 | GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
-| Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
+| DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
+| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
 | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
 | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
-| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
+| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
 
 It also has a basic set of calculate tools such as add, subtract, multiply and divide.

From d6ab39d0bae38bcc9deecd2ffadc5d54b4c872f7 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 22 Apr 2024 10:11:42 -0700
Subject: [PATCH 2/6] added documentation for chat with workflow

---
 vision_agent/agent/vision_agent.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index bbd2c1a5..5e7c6228 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -414,7 +414,7 @@ def __init__(
     ):
         """VisionAgent constructor.
 
-        Parameters
+        Parameters:
             task_model: the model to use for task decomposition.
             answer_model: the model to use for reasoning and concluding the answer.
             reflect_model: the model to use for self reflection.
@@ -479,6 +479,21 @@ def chat_with_workflow(
         reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
+        """Chat with the vision agent and return the final answer and all tool results.
+
+        Parameters:
+            chat: a conversation in the format of
+                [{"role": "user", "content": "describe your task here..."}].
+            image: the input image referenced in the chat parameter.
+            reference_data: a dictionary containing the reference image and mask, in
+                the format of {"image": "image.jpg", "mask": "mask.jpg"}.
+            visualize_output: whether to visualize the output.
+
+        Returns:
+            A tuple where the first item is the final answer and the second item is a
+            list of all the tool results. The last item in the tool results also
+            contains the visualized output.
+        """
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
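To make the new `chat_with_workflow` docstring concrete, here is a minimal usage sketch. It assumes `VisionAgent` is importable from `vision_agent.agent` and that the constructor falls back to default models when none are given; the file name is hypothetical.

```python
from vision_agent.agent import VisionAgent

agent = VisionAgent()  # assumes default task/answer/reflect models

# A single-turn conversation; the image referenced in the text is passed separately.
answer, tool_results = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the houses in this image."}],
    image="houses.jpg",  # hypothetical file
    visualize_output=True,
)

print(answer)            # the final answer string
print(tool_results[-1])  # per the docstring, also contains the visualized output
```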
From b373a74bf81cf7403c2e3f8ff0868440b579363d Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 22 Apr 2024 10:13:39 -0700
Subject: [PATCH 3/6] fixed usage for image caption

---
 vision_agent/tools/tools.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 6d2a7b47..aa48df8f 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -131,8 +131,7 @@ def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
 
 
 class ImageCaption(Tool):
-    r"""ImageCaption is a tool that can caption an image based on its contents
-    or tags.
+    r"""ImageCaption is a tool that can caption an image based on its contents or tags.
 
     Example
     -------
     """
 
     name = "image_caption_"
-    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns text describing the image."
     usage = {
         "required_parameters": [
             {"name": "image", "type": "str"},
         ],
         "examples": [
             {
-                "scenario": "Can you describe this image ? Image name: cat.jpg",
+                "scenario": "Can you describe this image? Image name: cat.jpg",
                 "parameters": {"image": "cat.jpg"},
             },
             {
-                "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
+                "scenario": "Can you caption this image with its main contents? Image name: cat_dog.jpg",
                 "parameters": {"image": "cat_dog.jpg"},
             },
-            {
-                "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
-                "parameters": {
-                    "image": "shirts.jpg",
-                },
-            },
         ],
     }

From 0144e93e00f9475621013548a742e5358aecd3f4 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 22 Apr 2024 14:41:16 -0700
Subject: [PATCH 4/6] formatting fix

---
 vision_agent/tools/tools.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 57db1ebb..6528c795 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -480,15 +480,15 @@ class ZeroShotCounting(Tool):
         ],
         "examples": [
             {
-                "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
+                "scenario": "Can you count the lids in the image? Image name: lids.jpg",
                 "parameters": {"image": "lids.jpg"},
             },
             {
-                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
+                "scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
                 "parameters": {"image": "tray.jpg"},
             },
             {
-                "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
+                "scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
                 "parameters": {
                     "image": "shirts.jpg",
                 },
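The two tools touched by these patches follow the same callable `Tool` pattern as the rest of `tools.py`. Below is a rough sketch of direct use, assuming both classes are exported from `vision_agent.tools` and take a single `image` argument as their `required_parameters` indicate; the file names are hypothetical.

```python
from vision_agent.tools import ImageCaption, ZeroShotCounting

# Caption an image; the tool's usage spec lists only "image" as required.
caption_tool = ImageCaption()
print(caption_tool(image="cat_dog.jpg"))  # e.g. a text description of the image

# Count objects of a single class without any prompt.
count_tool = ZeroShotCounting()
print(count_tool(image="lids.jpg"))  # e.g. the total object count
```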
From f49cb98bd7c7d3a0710c42fd0954816f6098306b Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Mon, 22 Apr 2024 17:37:37 -0700
Subject: [PATCH 5/6] remove datastore docs

---
 docs/lmms.md               | 20 +++++++++++
 docs/lmms_and_datastore.md | 69 --------------------------------------
 2 files changed, 20 insertions(+), 69 deletions(-)
 create mode 100644 docs/lmms.md
 delete mode 100644 docs/lmms_and_datastore.md

diff --git a/docs/lmms.md b/docs/lmms.md
new file mode 100644
index 00000000..21c329e0
--- /dev/null
+++ b/docs/lmms.md
@@ -0,0 +1,20 @@
+### LMMs
+One of the problems of dealing with image data is that it can be difficult to organize
+and search. For example, you might have a bunch of pictures of houses and want to count
+how many yellow houses you have, or how many houses with adobe roofs. The vision agent
+library uses LMMs to help create tags or descriptions of images to allow you to search
+over them, or use them in a database to carry out other operations.
+
+To get started, you can use an LMM to start generating text from images. The following
+code will use the LLaVA-1.6 34B model to generate a description of the image you pass it.
+
+```python
+import vision_agent as va
+
+model = va.lmm.get_lmm("llava")
+model.generate("Describe this image", "image.png")
+>>> "A yellow house with a green lawn."
+```
+
+**WARNING** We are hosting the LLaVA-1.6 34B model; if it times out, please wait ~3-5
+min for the server to warm up, as it shuts down when usage is low.

diff --git a/docs/lmms_and_datastore.md b/docs/lmms_and_datastore.md
deleted file mode 100644
index 302b9732..00000000
--- a/docs/lmms_and_datastore.md
+++ /dev/null
@@ -1,69 +0,0 @@
-### LMMs
-One of the problems of dealing with image data is it can be difficult to organize and
-search. For example, you might have a bunch of pictures of houses and want to count how
-many yellow houses you have, or how many houses with adobe roofs. The vision agent
-library uses LMMs to help create tags or descriptions of images to allow you to search
-over them, or use them in a database to carry out other operations.
-
-To get started, you can use an LMM to start generating text from images. The following
-code will use the LLaVA-1.6 34B model to generate a description of the image you pass it.
-
-```python
-import vision_agent as va
-
-model = va.lmm.get_lmm("llava")
-model.generate("Describe this image", "image.png")
->>> "A yellow house with a green lawn."
-```
-
-**WARNING** We are hosting the LLaVA-1.6 34B model, if it times out please wait ~3-5
-min for the server to warm up as it shuts down when usage is low.
-
-### DataStore
-You can use the `DataStore` class to store your images, add new metadata to them such
-as descriptions, and search over different columns.
-
-```python
-import vision_agent as va
-import pandas as pd
-
-df = pd.DataFrame({"image_paths": ["image1.png", "image2.png", "image3.png"]})
-ds = va.data.DataStore(df)
-ds = ds.add_lmm(va.lmm.get_lmm("llava"))
-ds = ds.add_embedder(va.emb.get_embedder("sentence-transformer"))
-
-ds = ds.add_column("descriptions", "Describe this image.")
-```
-
-This will use the prompt you passed, "Describe this image.", and the LMM to create a
-new column of descriptions for your image. Your data will now contain a new column with
-the descriptions of each image:
-
-| image\_paths | image\_id | descriptions |
-| --- | --- | --- |
-| image1.png | 1 | "A yellow house with a green lawn." |
-| image2.png | 2 | "A white house with a two door garage." |
-| image3.png | 3 | "A wooden house in the middle of the forest." |
-
-You can now create an index on the descriptions column and search over it to find images
-that match your query.
-
-```python
-ds = ds.build_index("descriptions")
-ds.search("A yellow house.", top_k=1)
->>> [{'image_paths': 'image1.png', 'image_id': 1, 'descriptions': 'A yellow house with a green lawn.'}]
-```
-
-You can also create other columns for you data such as `is_yellow`:
-
-```python
-ds = ds.add_column("is_yellow", "Is the house in this image yellow? Please answer yes or no.")
-```
-
-which would give you a dataset similar to this:
-
-| image\_paths | image\_id | descriptions | is\_yellow |
-| --- | --- | --- | --- |
-| image1.png | 1 | "A yellow house with a green lawn." | "yes" |
-| image2.png | 2 | "A white house with a two door garage." | "no" |
-| image3.png | 3 | "A wooden house in the middle of the forest." | "no" |
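Given the cold-start behavior the WARNING in `docs/lmms.md` describes, a caller may want to retry while the hosted model warms up. This is only a sketch: the docs do not specify how a timeout surfaces, so it assumes an exception is raised and catches broadly.

```python
import time

import vision_agent as va

model = va.lmm.get_lmm("llava")

description = None
for _ in range(5):
    try:
        description = model.generate("Describe this image", "image.png")
        break  # success, stop retrying
    except Exception:
        time.sleep(60)  # give the server a minute to warm up, then retry

print(description)
```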
From 25cca94926e6005f3bc02b817569d8c14a687b98 Mon Sep 17 00:00:00 2001
From: Dillon Laird
Date: Tue, 23 Apr 2024 17:11:54 -0700
Subject: [PATCH 6/6] add box contains

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 49defdf6..48675938 100644
--- a/README.md
+++ b/README.md
@@ -110,6 +110,7 @@ you. For example:
 | BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
 | SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
 | BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
+| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking whether one box is contained within another box. |
 | ExtractFrames | ExtractFrames extracts frames with motion from a video. |
 | ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image |
 | VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and visual prompt |
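For intuition about the new `BboxContains` row, here is a rough sketch of the metric the description implies (intersection area divided by the target box's area), not the library's actual implementation. The `[x1, y1, x2, y2]` box format is an assumption.

```python
def bbox_contains(target, other):
    # Corners of the intersection rectangle of the two boxes.
    x1 = max(target[0], other[0])
    y1 = max(target[1], other[1])
    x2 = min(target[2], other[2])
    y2 = min(target[3], other[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    target_area = (target[2] - target[0]) * (target[3] - target[1])
    # Intersection over the target box's area, normalized to 2 decimal places.
    return round(inter / target_area, 2)

# A target box fully inside the other box scores 1.0 ...
print(bbox_contains([2, 2, 5, 5], [0, 0, 10, 10]))  # 1.0
# ... while a disjoint target scores 0.0.
print(bbox_contains([2, 2, 5, 5], [6, 6, 10, 10]))  # 0.0
```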