From 7d7243915bbbd461537571b0a01293b93ac78983 Mon Sep 17 00:00:00 2001
From: Asia <2736300+humpydonkey@users.noreply.github.com>
Date: Thu, 18 Apr 2024 17:14:53 -0700
Subject: [PATCH] Add DINOv as a new tool (#44)

* Add DINOv as a new tool

* Fix lint errors

* Update docs

* Fix param name mismatch (#45)

Co-authored-by: Yazhou Cao <yazhou.cao@landing.ai>

* Grammar/Spelling fixes (#46)

* Update prompts.py

* Update vision_agent_prompts.py

* Update reflexion_prompts.py

* Update vision_agent_prompts.py

* Update easytool_prompts.py

* Update prompts.py

* Update vision_agent_prompts.py

* Switch to the tools endpoint (#40)

* get endpoint ready for demo

fixed tools.json

Update vision_agent/tools/tools.py

Bug fixes

* Fix linter errors

* Fix a bug in result parsing

* Include scores in the G-SAM model response

* Removed tools.json; need to find a better format

* Fixing the endpoint for CLIP and adding thresholds for grounding tools

* fix mypy errors

* fixed example notebook

---------

Co-authored-by: Yazhou Cao <yazhou.cao@landing.ai>
Co-authored-by: shankar_ws3 <shankar.anand@landing.ai>

* Support streaming chat logs of an agent (#47)

Add a callback for reporting the chat progress of an agent

* Empty-Commit

* Empty-Commit: attempt to fix release

* [skip ci] chore(release): vision-agent 0.0.49

* Fix typo (#48)

Co-authored-by: Yazhou Cao <yazhou.cao@landing.ai>

* [skip ci] chore(release): vision-agent 0.0.50

* Fix a typo in log (#49)

Fix another typo

Co-authored-by: Yazhou Cao <yazhou.cao@landing.ai>

* [skip ci] chore(release): vision-agent 0.0.51

* Fix Baby Cam Use Case (#51)

* fix visualization error

* added font and score to viz

* changed to smaller font file

* Support streaming chat logs of an agent (#47)

Add a callback for reporting the chat progress of an agent

* fix visualize score issue

* updated descriptions, fixed counter bug

* added visualize_output

* make feedback more concrete

* made naming more consistent

* replaced individual calc ops with calculator tool

* fix random colors

* fix prompts for tools

* update reflection prompt

* update readme

* formatting fix

* fixed mypy errors

* fix merge issue

---------

Co-authored-by: Asia <2736300+humpydonkey@users.noreply.github.com>

* [skip ci] chore(release): vision-agent 0.0.52

* Add image caption tool (#52)

added image caption tool

* [skip ci] chore(release): vision-agent 0.0.53

* refactor: switch model endpoints (#54)

* Switch the host of model endpoint to api.dev.landing.ai
* DRY/Abstract out the inference code in tools
* Introduce LandingaiAPIKey and support loading from .env file
* Add integration tests for four model tools
* Minor tweaks/fixes
* Remove dead code
* Bump the minor version to 0.1.0

* [skip ci] chore(release): vision-agent 0.1.1

* Pool Demo (#53)

* visualized output/reflection to handle extract_frames_

* remove ipdb

* added json mode for lmm, upgraded gpt-4-turbo

* updated reflection prompt

* refactor to make function simpler

* updated reflection prompt, add tool usage doc

* fixed format issue

* fixed type issue

* fixed test case

* [skip ci] chore(release): vision-agent 0.1.2

* feat: allow disable motion detection in frame extraction function (#55)

* Tweak frame extraction function

* remove default motion detection, extract at 0.5 fps

* lmm now takes multiple images

* removed counter

* tweaked prompt

* updated vision agent to reflect on multiple images

* fix test case

* added box distance

* adjusted prompts

---------

Co-authored-by: Yazhou Cao <yazhou.cao@landing.ai>
Co-authored-by: Dillon Laird <dillonalaird@gmail.com>

* [skip ci] chore(release): vision-agent 0.1.3

* doc changes

* fixed merge issues

* fix color issue

* add dinov with updated endpoint

* formatting fix

* added reference mask support

* fix linting

---------

Co-authored-by: Yazhou Cao <yazhou.cao@landing.ai>
Co-authored-by: Cameron Maloney <cameron.maloney@warriorlife.net>
Co-authored-by: shankar_ws3 <shankar.anand@landing.ai>
Co-authored-by: Dillon Laird <dillonalaird@gmail.com>
Co-authored-by: Shankar <90070882+shankar-landing-ai@users.noreply.github.com>
---
 examples/mask_app/app.py           | 34 ++++++++++
 examples/mask_app/requirements.txt |  2 +
 vision_agent/agent/vision_agent.py |  8 +++
 vision_agent/image_utils.py        |  4 +-
 vision_agent/tools/__init__.py     |  1 +
 vision_agent/tools/tools.py        | 99 ++++++++++++++++++++++++++++++
 6 files changed, 147 insertions(+), 1 deletion(-)
 create mode 100644 examples/mask_app/app.py
 create mode 100644 examples/mask_app/requirements.txt

diff --git a/examples/mask_app/app.py b/examples/mask_app/app.py
new file mode 100644
index 00000000..23a5fc78
--- /dev/null
+++ b/examples/mask_app/app.py
@@ -0,0 +1,34 @@
+import cv2
+import streamlit as st
+from PIL import Image
+from streamlit_drawable_canvas import st_canvas
+
+st.title("Image Segmentation Mask App")
+
+uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"])
+# None until an image is uploaded, so the save handler below never reads an
+# unbound name (clicking "Save Mask Image" with no upload used to raise NameError).
+orig_size = Image.open(uploaded_file).size if uploaded_file is not None else None
+
+stroke_width = st.sidebar.slider("Stroke width: ", 1, 50, 25)
+stroke_color = st.sidebar.color_picker("Stroke color hex: ")
+
+canvas_result = st_canvas(
+    fill_color="rgba(255, 165, 0, 0.3)",  # Fixed fill color with some opacity
+    stroke_width=stroke_width,
+    stroke_color=stroke_color,
+    background_color="#eee",
+    background_image=Image.open(uploaded_file) if uploaded_file else None,
+    update_streamlit=True,
+    height=500,
+    drawing_mode="freedraw",
+    key="canvas",
+)
+
+if canvas_result.image_data is not None:
+    mask = canvas_result.image_data.astype("uint8")[..., 3]
+    mask[mask > 0] = 255
+    if st.button("Save Mask Image") and orig_size:
+        mask = cv2.resize(mask, orig_size, interpolation=cv2.INTER_NEAREST)
+        cv2.imwrite("mask.png", mask)
+        st.success("Mask Image saved successfully.")
diff --git a/examples/mask_app/requirements.txt b/examples/mask_app/requirements.txt
new file mode 100644
index 00000000..3ce2aea0
--- /dev/null
+++ b/examples/mask_app/requirements.txt
@@ -0,0 +1,2 @@
+streamlit
+streamlit-drawable-canvas
diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index a3f09b82..44c3aa08 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -365,6 +365,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
             "grounding_sam_",
             "grounding_dino_",
             "extract_frames_",
+            "dinov_",
         ]:
             continue
 
@@ -469,11 +470,18 @@ def chat_with_workflow(
         self,
         chat: List[Dict[str, str]],
         image: Optional[Union[str, Path]] = None,
+        reference_data: Optional[Dict[str, str]] = None,
         visualize_output: Optional[bool] = False,
     ) -> Tuple[str, List[Dict]]:
         question = chat[0]["content"]
         if image:
             question += f" Image name: {image}"
+        if reference_data:
+            if not ("image" in reference_data and "mask" in reference_data):
+                raise ValueError(
+                    f"Reference data must contain 'image' and 'mask'. but got {reference_data}"
+                )
+            question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"
 
         reflections = ""
         final_answer = ""
diff --git a/vision_agent/image_utils.py b/vision_agent/image_utils.py
index d0164e11..f36a2033 100644
--- a/vision_agent/image_utils.py
+++ b/vision_agent/image_utils.py
@@ -103,7 +103,9 @@ def overlay_bboxes(
     elif isinstance(image, np.ndarray):
         image = Image.fromarray(image)
 
-    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
+    # De-duplicate labels in first-seen order so colors are stable across runs.
+    unique_labels = dict.fromkeys(bboxes["labels"])
+    color = {lbl: COLORS[i % len(COLORS)] for i, lbl in enumerate(unique_labels)}
 
     width, height = image.size
     fontsize = max(12, int(min(width, height) / 40))
diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py
index 63931c9f..1c1c6e73 100644
--- a/vision_agent/tools/__init__.py
+++ b/vision_agent/tools/__init__.py
@@ -6,6 +6,7 @@
     BboxIoU,
     BoxDistance,
     Crop,
+    DINOv,
     ExtractFrames,
     GroundingDINO,
     GroundingSAM,
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index 3a5c8a4f..6d2a7b47 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -372,6 +372,104 @@ def __call__(
         return ret_pred
 
 
+class DINOv(Tool):
+    r"""DINOv is a tool that can detect and segment similar objects with the given input masks.
+
+    Example
+    -------
+        >>> import vision_agent as va
+        >>> t = va.tools.DINOv()
+        >>> t(prompt=[{"mask": "balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg")
+        {'scores': [0.512, 0.212],
+        'masks': [array([[0, 0, 0, ..., 0, 0, 0],
+           ...,
+           [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
+        array([[0, 0, 0, ..., 0, 0, 0],
+           ...,
+           [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}
+    """
+
+    name = "dinov_"
+    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
+    usage = {
+        "required_parameters": [
+            {"name": "prompt", "type": "List[Dict[str, str]]"},
+            {"name": "image", "type": "str"},
+        ],
+        "examples": [
+            {
+                "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
+                "parameters": {
+                    "prompt": [
+                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
+                    ],
+                    "image": "input.jpg",
+                },
+            },
+            {
+                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: background.png Reference mask: mask.png",
+                "parameters": {
+                    "prompt": [
+                        {"mask": "mask.png", "image": "background.png"},
+                    ],
+                    "image": "original.jpg",
+                },
+            },
+        ],
+    }
+
+    def __call__(
+        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
+    ) -> Dict:
+        """Invoke the DINOv model.
+
+        Parameters:
+            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}. The list and its dicts are not modified.
+            image: the input image to segment.
+
+        Returns:
+            A dictionary with the keys 'scores', 'masks', 'mask_shape' and 'labels', which stores the detected segmentation masks and their scores.
+        """
+        image_b64 = convert_to_b64(image)
+        # Encode into fresh dicts so the caller's prompt dicts are not mutated.
+        keys = ("mask", "image")
+        prompt_b64 = [{**p, **{k: convert_to_b64(p[k]) for k in keys}} for p in prompt]
+        request_data = {
+            "prompt": prompt_b64,
+            "image": image_b64,
+            "tool": "dinov",
+        }
+        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
+        if "bboxes" in data:
+            data["bboxes"] = [
+                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
+            ]
+        if "masks" in data:
+            data["masks"] = [
+                rle_decode(mask_rle=mask, shape=data["mask_shape"])
+                for mask in data["masks"]
+            ]
+        data["labels"] = ["visual prompt"] * len(data.get("masks", []))
+        return data
+
+
+class AgentDINOv(DINOv):
+    r"""Same as DINOv, but saves each mask to a PNG file and returns the file paths."""
+
+    def __call__(
+        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
+    ) -> Dict:
+        rets = super().__call__(prompt, image)
+        mask_files = []
+        for mask in rets["masks"]:
+            with tempfile.NamedTemporaryFile(suffix=".mask.png", delete=False) as tmp:
+                # Save into the temp file itself so no empty sibling file is orphaned.
+                Image.fromarray(mask * 255).save(tmp, format="PNG")
+                mask_files.append(tmp.name)
+        rets["masks"] = mask_files
+        return rets
+
+
 class AgentGroundingSAM(GroundingSAM):
     r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
     returns the file name. This makes it easier for agents to use.
@@ -652,6 +750,7 @@ def __call__(self, equation: str) -> float:
             ImageCaption,
             GroundingDINO,
             AgentGroundingSAM,
+            AgentDINOv,
             ExtractFrames,
             Crop,
             BboxArea,