diff --git a/pyproject.toml b/pyproject.toml
index e27a9e2a..6e77b41a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,6 +28,7 @@ moviepy = "1.*"
 opencv-python-headless = "4.*"
 tabulate = "^0.9.0"
 pydantic-settings = "^2.2.1"
+scipy = "1.13.*"
 
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 10daf7eb..0241ac11 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -6,6 +6,7 @@
     BboxArea,
     BboxIoU,
     BoxDistance,
+    MaskDistance,
     Crop,
     DINOv,
     ExtractFrames,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index a9eca833..55052eee 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -9,6 +9,7 @@
 import requests
 from PIL import Image
 from PIL.Image import Image as ImageType
+from scipy.spatial import distance
 
 from vision_agent.image_utils import (
     b64_to_pil,
@@ -544,7 +545,7 @@ class VisualPromptCounting(Tool):
     -------
     >>> import vision_agent as va
     >>> prompt_count = va.tools.VisualPromptCounting()
-    >>> prompt_count(image="image1.jpg", prompt="0.1, 0.1, 0.4, 0.42")
+    >>> prompt_count(image="image1.jpg", prompt={"bbox": [0.1, 0.1, 0.4, 0.42]})
     {'count': 23}
     """
 
@@ -554,46 +555,54 @@ class VisualPromptCounting(Tool):
     usage = {
         "required_parameters": [
             {"name": "image", "type": "str"},
-            {"name": "prompt", "type": "str"},
+            {"name": "prompt", "type": "Dict[str, List[float]]"},
         ],
         "examples": [
             {
                 "scenario": "Here is an example of a lid '0.1, 0.1, 0.14, 0.2', Can you count the items in the image ? Image name: lids.jpg",
-                "parameters": {"image": "lids.jpg", "prompt": "0.1, 0.1, 0.14, 0.2"},
+                "parameters": {
+                    "image": "lids.jpg",
+                    "prompt": {"bbox": [0.1, 0.1, 0.14, 0.2]},
+                },
             },
             {
-                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
-                "parameters": {"image": "tray.jpg", "prompt": "0.1, 0.1, 0.2, 0.25"},
+                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg, reference_data: {'bbox': [0.1, 0.1, 0.2, 0.25]}",
+                "parameters": {
+                    "image": "tray.jpg",
+                    "prompt": {"bbox": [0.1, 0.1, 0.2, 0.25]},
+                },
             },
             {
-                "scenario": "Can you count this item based on an example, reference_data: '0.1, 0.15, 0.2, 0.2' ? Image name: shirts.jpg",
+                "scenario": "Can you count this item based on an example, reference_data: {'bbox': [100, 115, 200, 200]} ? Image name: shirts.jpg",
                 "parameters": {
                     "image": "shirts.jpg",
-                    "prompt": "0.1, 0.15, 0.2, 0.2",
+                    "prompt": {"bbox": [100, 115, 200, 200]},
                 },
             },
             {
-                "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
+                "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg, reference_data: {'bbox': [0.1, 0.1, 0.6, 0.65]}",
                 "parameters": {
                     "image": "shoes.jpg",
-                    "prompt": "0.1, 0.1, 0.6, 0.65",
+                    "prompt": {"bbox": [0.1, 0.1, 0.6, 0.65]},
                 },
             },
         ],
     }
 
-    # TODO: Add support for input multiple images, which aligns with the output type.
-    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
+    def __call__(
+        self, image: Union[str, ImageType], prompt: Dict[str, List[float]]
+    ) -> Dict:
         """Invoke the few shot counting model.
 
         Parameters:
             image: the input image.
+            prompt: the visual prompt which is a bounding box describing the object.
 
         Returns:
             A dictionary containing the key 'count' and the count as value. E.g. {count: 12}
         """
         image_size = get_image_size(image)
-        bbox = [float(x) for x in prompt.split(",")]
+        bbox = prompt["bbox"]
         prompt = ", ".join(map(str, denormalize_bbox(bbox, image_size)))
 
         image_b64 = convert_to_b64(image)
@@ -878,7 +887,7 @@ class SegIoU(Tool):
         ],
         "examples": [
             {
-                "scenario": "If you want to calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                "scenario": "Calculate the intersection over union of the segmentation masks for mask_file1.jpg and mask_file2.jpg",
                 "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
             }
         ],
@@ -976,6 +985,33 @@ def __call__(self, bbox1: List[int], bbox2: List[int]) -> float:
         return cast(float, round(np.sqrt(horizontal_dist**2 + vertical_dist**2), 2))
 
 
+class MaskDistance(Tool):
+    name = "mask_distance_"
+    description = "'mask_distance_' calculates the distance between two masks. It is helpful in checking the proximity of two objects. It returns the minimum distance between the given masks."
+    usage = {
+        "required_parameters": [
+            {"name": "mask1", "type": "str"},
+            {"name": "mask2", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Calculate the distance between the segmentation masks for mask_file1.jpg and mask_file2.jpg",
+                "parameters": {"mask1": "mask_file1.png", "mask2": "mask_file2.png"},
+            }
+        ],
+    }
+
+    def __call__(self, mask1: Union[str, Path], mask2: Union[str, Path]) -> float:
+        pil_mask1 = Image.open(str(mask1))
+        pil_mask2 = Image.open(str(mask2))
+        np_mask1 = np.clip(np.array(pil_mask1), 0, 1)
+        np_mask2 = np.clip(np.array(pil_mask2), 0, 1)
+        mask1_points = np.transpose(np.nonzero(np_mask1))
+        mask2_points = np.transpose(np.nonzero(np_mask2))
+        dist_matrix = distance.cdist(mask1_points, mask2_points, "euclidean")
+        return cast(float, np.round(np.min(dist_matrix), 2))
+
+
 class ExtractFrames(Tool):
     r"""Extract frames from a video."""
 
@@ -1110,8 +1146,8 @@ def __call__(self, equation: str) -> float:
     Crop,
     BboxArea,
     SegArea,
-    BboxIoU,
     SegIoU,
+    MaskDistance,
     BboxContains,
     BoxDistance,
     OCR,
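
A quick usage sketch of the updated VisualPromptCounting interface, following the docstring in the diff above; the image name and box coordinates are illustrative:

    import vision_agent as va

    # The visual prompt is now a dict with a normalized "bbox" key rather than a
    # comma-separated string; the tool denormalizes it to pixel coordinates internally.
    prompt_count = va.tools.VisualPromptCounting()
    result = prompt_count(image="lids.jpg", prompt={"bbox": [0.1, 0.1, 0.14, 0.2]})
    print(result)  # e.g. {'count': 23}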
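
And a sketch of the new MaskDistance tool; the mask file names are placeholders for binary segmentation masks saved to disk:

    import vision_agent as va

    # MaskDistance gathers the nonzero pixel coordinates of each mask and returns
    # the minimum pairwise Euclidean distance in pixels, rounded to two decimals;
    # overlapping masks yield 0.0.
    mask_distance = va.tools.MaskDistance()
    dist = mask_distance(mask1="mask_file1.png", mask2="mask_file2.png")
    print(f"minimum distance between masks: {dist} pixels")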