Minor updates to comments (#58)
* added new tools to README

* added documentation for chat with workflow

* fixed usage for image caption

* formatting fix

* remove datastore docs

* add box contains
dillonalaird authored Apr 24, 2024
1 parent cec32f7 commit 431dae4
Showing 5 changed files with 48 additions and 86 deletions.
7 changes: 5 additions & 2 deletions README.md
@@ -99,15 +99,18 @@ you. For example:
| Tool | Description |
| --- | --- |
| CLIP | CLIP is a tool that can classify or tag any image given a set of input classes or tags. |
| ImageCaption | ImageCaption is a tool that can generate a caption for an image. |
| GroundingDINO | GroundingDINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. |
| GroundingSAM | GroundingSAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. |
| Counter | Counter detects and counts the number of objects in an image given an input such as a category name or referring expression. |
| DINOv | DINOv is a tool that can detect arbitrary objects using a referring mask. |
| ExtractFrames | ExtractFrames extracts frames with motion from a video. |
| Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. |
| BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. |
| SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. |
| BboxIoU | BboxIoU returns the intersection over union of two bounding boxes normalized to 2 decimal places. |
| SegIoU | SegIoU returns the intersection over union of two segmentation masks normalized to 2 decimal places. |
| BoxDistance | BoxDistance returns the minimum distance between two bounding boxes normalized to 2 decimal places. |
| BboxContains | BboxContains returns the intersection of two boxes over the target box area. It is useful for checking if one box is contained within another (see the sketch after this table). |
| ZeroShotCounting | ZeroShotCounting returns the total number of objects belonging to a single class in a given image. |
| VisualPromptCounting | VisualPromptCounting returns the total number of objects belonging to a single class given an image and a visual prompt. |
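
The box metric tools in this table are simple geometry. As a rough illustration (our sketch, not the library's implementation), the metrics can be computed for axis-aligned boxes in `[x1, y1, x2, y2]` format as follows:

```python
# Illustrative sketch of the box metrics described in the table above; assumes
# axis-aligned boxes in [x1, y1, x2, y2] format. Not the library's exact code.
from typing import List


def intersection_area(box1: List[float], box2: List[float]) -> float:
    """Overlap area of two boxes (0.0 if they do not overlap)."""
    x1, y1 = max(box1[0], box2[0]), max(box1[1], box2[1])
    x2, y2 = min(box1[2], box2[2]), min(box1[3], box2[3])
    return max(0.0, x2 - x1) * max(0.0, y2 - y1)


def bbox_iou(box1: List[float], box2: List[float]) -> float:
    """Intersection over union, rounded to 2 decimal places (cf. BboxIoU)."""
    inter = intersection_area(box1, box2)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter
    return round(inter / union, 2) if union > 0 else 0.0


def bbox_contains(target: List[float], box: List[float]) -> float:
    """Intersection over the target box area (cf. BboxContains)."""
    target_area = (target[2] - target[0]) * (target[3] - target[1])
    if target_area <= 0:
        return 0.0
    return round(intersection_area(target, box) / target_area, 2)


def box_distance(box1: List[float], box2: List[float]) -> float:
    """Minimum edge-to-edge distance, 0.0 for overlapping boxes (cf. BoxDistance)."""
    dx = max(0.0, max(box1[0], box2[0]) - min(box1[2], box2[2]))
    dy = max(0.0, max(box1[1], box2[1]) - min(box1[3], box2[3]))
    return round((dx ** 2 + dy ** 2) ** 0.5, 2)
```

For example, `bbox_contains([0, 0, 10, 10], [0, 0, 5, 5])` is 0.25, while reversing the arguments gives 1.0, which is why the metric is handy for containment checks.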
20 changes: 20 additions & 0 deletions docs/lmms.md
@@ -0,0 +1,20 @@
### LMMs
One of the challenges of working with image data is that it can be difficult to
organize and search. For example, you might have a collection of pictures of houses
and want to count how many yellow houses you have, or how many have adobe roofs. The
vision agent library uses LMMs to create tags or descriptions of images so that you
can search over them, or store them in a database to carry out other operations.

To get started, you can use an LMM to generate text from images. The following
code uses the LLaVA-1.6 34B model to generate a description of the image you pass it.

```python
import vision_agent as va

model = va.lmm.get_lmm("llava")
model.generate("Describe this image", "image.png")
# >>> "A yellow house with a green lawn."
```

**WARNING** We are hosting the LLaVA-1.6 34B model. If it times out, please wait
~3-5 minutes for the server to warm up, as it shuts down when usage is low.
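
To make the search use case from the introduction concrete, here is a hypothetical sketch that tags a folder of images and filters them by keyword. Only `get_lmm` and `generate` come from the snippet above; the folder name and the keyword-matching logic are illustrative assumptions.

```python
# Hypothetical sketch: describe every image in a folder with an LMM, then run a
# naive keyword search over the descriptions. The "houses" folder and the search
# logic are assumptions for illustration.
from pathlib import Path

import vision_agent as va

model = va.lmm.get_lmm("llava")

# Build an index mapping image path -> generated description.
index = {
    str(path): model.generate("Describe this image", str(path))
    for path in Path("houses").glob("*.png")
}

# Count the images whose description mentions a yellow house.
matches = [p for p, desc in index.items() if "yellow house" in desc.lower()]
print(f"{len(matches)} of {len(index)} images look like yellow houses")
```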
69 changes: 0 additions & 69 deletions docs/lmms_and_datastore.md

This file was deleted.

17 changes: 16 additions & 1 deletion vision_agent/agent/vision_agent.py
@@ -428,7 +428,7 @@ def __init__(
):
"""VisionAgent constructor.
-    Parameters
+    Parameters:
task_model: the model to use for task decomposition.
answer_model: the model to use for reasoning and concluding the answer.
reflect_model: the model to use for self reflection.
@@ -504,6 +504,21 @@ def chat_with_workflow(
reference_data: Optional[Dict[str, str]] = None,
visualize_output: Optional[bool] = False,
) -> Tuple[str, List[Dict]]:
"""Chat with the vision agent and return the final answer and all tool results.
Parameters:
chat: a conversation in the format of
[{"role": "user", "content": "describe your task here..."}].
image: the input image referenced in the chat parameter.
reference_data: a dictionary containing the reference image and mask. in the
format of {"image": "image.jpg", "mask": "mask.jpg}
visualize_output: whether to visualize the output.
Returns:
A tuple where the first item is the final answer and the second item is a
list of all the tool results. The last item in the tool results also
contains the visualized output.
"""
question = chat[0]["content"]
if image:
question += f" Image name: {image}"
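
Based only on the docstring above, a call to `chat_with_workflow` might look like the following sketch; the constructor arguments and the exact contents of the returned tool results are assumptions.

```python
# Hypothetical usage of VisionAgent.chat_with_workflow, inferred from the
# docstring above; constructor arguments and return details may differ.
import vision_agent as va

agent = va.agent.VisionAgent()
answer, tool_results = agent.chat_with_workflow(
    [{"role": "user", "content": "Count the yellow houses in this image"}],
    image="houses.jpg",
    visualize_output=True,
)
print(answer)            # final answer string
print(tool_results[-1])  # the last tool result also contains the visualized output
```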
21 changes: 7 additions & 14 deletions vision_agent/tools/tools.py
@@ -108,8 +108,7 @@ def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:


class ImageCaption(Tool):
r"""ImageCaption is a tool that can caption an image based on its contents
or tags.
r"""ImageCaption is a tool that can caption an image based on its contents or tags.
Example
-------
@@ -120,26 +119,20 @@ class ImageCaption(Tool):
"""

name = "image_caption_"
-    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
+    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image."
usage = {
"required_parameters": [
{"name": "image", "type": "str"},
],
"examples": [
{
"scenario": "Can you describe this image ? Image name: cat.jpg",
"scenario": "Can you describe this image? Image name: cat.jpg",
"parameters": {"image": "cat.jpg"},
},
{
"scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
"scenario": "Can you caption this image with their main contents? Image name: cat_dog.jpg",
"parameters": {"image": "cat_dog.jpg"},
},
-        {
-            "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
-            "parameters": {
-                "image": "shirts.jpg",
-            },
-        },
],
}
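
As a hypothetical illustration, the scenario/parameter pairs above would map to a direct tool call roughly like this; the import path and call signature are assumptions, not confirmed by the diff.

```python
# Hypothetical direct invocation of the ImageCaption tool, mirroring the usage
# examples above; the import path and __call__ signature are assumptions.
from vision_agent.tools import ImageCaption

caption_tool = ImageCaption()
result = caption_tool(image="cat.jpg")  # e.g. a text description of cat.jpg
print(result)
```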

@@ -487,15 +480,15 @@ class ZeroShotCounting(Tool):
],
"examples": [
{
"scenario": "Can you count the lids in the image ? Image name: lids.jpg",
"scenario": "Can you count the lids in the image? Image name: lids.jpg",
"parameters": {"image": "lids.jpg"},
},
{
"scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
"scenario": "Can you count the total number of objects in this image? Image name: tray.jpg",
"parameters": {"image": "tray.jpg"},
},
{
"scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
"scenario": "Can you build me an object counting tool? Image name: shirts.jpg",
"parameters": {
"image": "shirts.jpg",
},