From 1b3806626cf2a478d35b25e44b78d19134cac57d Mon Sep 17 00:00:00 2001 From: Shankar <90070882+shankar-vision-eng@users.noreply.github.com> Date: Thu, 30 May 2024 13:16:30 -0700 Subject: [PATCH] fix tool desc (#102) --- vision_agent/tools/tools.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 4102ed91..ca3f1565 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -58,9 +58,10 @@ def grounding_dino( box_threshold: float = 0.20, iou_threshold: float = 0.20, ) -> List[Dict[str, Any]]: - """'grounding_dino' is a tool that can detect and count objects given a text prompt - such as category names or referring expressions. It returns a list and count of - bounding boxes, label names and associated probability scores. + """'grounding_dino' is a tool that can detect and count multiple objects given a text + prompt such as category names or referring expressions. The categories in the text prompt + are separated by commas or periods. It returns a list and count of bounding boxes, + label names and associated probability scores. Parameters: prompt (str): The prompt to ground to the image. @@ -111,9 +112,10 @@ def grounding_sam( box_threshold: float = 0.20, iou_threshold: float = 0.20, ) -> List[Dict[str, Any]]: - """'grounding_sam' is a tool that can detect and segment objects given a text - prompt such as category names or referring expressions. It returns a list of - bounding boxes, label names and masks file names and associated probability scores. + """'grounding_sam' is a tool that can detect and segment multiple objects given a + text prompt such as category names or referring expressions. The categories in the text + prompt are separated by commas or periods. It returns a list of bounding boxes, + label names, mask file names and associated probability scores. Parameters: prompt (str): The prompt to ground to the image. 
@@ -343,9 +345,9 @@ def image_question_answering(image: np.ndarray, prompt: str) -> str: def clip(image: np.ndarray, classes: List[str]) -> Dict[str, Any]: - """'clip' is a tool that can classify an image given a list of input classes or tags. - It returns the same list of the input classes along with their probability scores - based on image content. + """'clip' is a tool that can classify an image or a cropped detection given a list + of input classes or tags. It returns the same list of the input classes along with + their probability scores based on image content. Parameters: image (np.ndarray): The image to classify or tag