Add DINOv as a new tool (#44)
* Add DINOv as a new tool

* Fix lint errors

* Update docs

* Fix param name mismatch (#45)

Co-authored-by: Yazhou Cao <[email protected]>

* Grammar/Spelling fixes (#46)

* Update prompts.py

* Update vision_agent_prompts.py

* Update reflexion_prompts.py

* Update vision_agent_prompts.py

* Update easytool_prompts.py

* Update prompts.py

* Update vision_agent_prompts.py

* Switch to the tools endpoint (#40)

* get endpoint ready for demo

fixed tools.json

Update vision_agent/tools/tools.py

Bug fixes

* Fix linter errors

* Fix a bug in result parsing

* Include scores in the G-SAM model response

* Removed tools.json; need to find a better format

* Fixing the endpoint for CLIP and adding thresholds for grounding tools

* fix mypy errors

* fixed example notebook

---------

Co-authored-by: Yazhou Cao <[email protected]>
Co-authored-by: shankar_ws3 <[email protected]>

* Support streaming chat logs of an agent (#47)

Add a callback for reporting the chat progress of an agent

* Empty-Commit

* Empty-Commit: attempt to fix release

* [skip ci] chore(release): vision-agent 0.0.49

* Fix typo (#48)

Co-authored-by: Yazhou Cao <[email protected]>

* [skip ci] chore(release): vision-agent 0.0.50

* Fix a typo in log (#49)

Fix another typo

Co-authored-by: Yazhou Cao <[email protected]>

* [skip ci] chore(release): vision-agent 0.0.51

* Fix Baby Cam Use Case (#51)

* fix visualization error

* added font and score to viz

* changed to smaller font file

* Support streaming chat logs of an agent (#47)

Add a callback for reporting the chat progress of an agent

* fix visualize score issue

* updated descriptions, fixed counter bug

* added visualize_output

* make feedback more concrete

* made naming more consistent

* replaced individual calc ops with calculator tool

* fix random colors

* fix prompts for tools

* update reflection prompt

* update readme

* formatting fix

* fixed mypy errors

* fix merge issue

---------

Co-authored-by: Asia <[email protected]>

* [skip ci] chore(release): vision-agent 0.0.52

* Add image caption tool (#52)

added image caption tool

* [skip ci] chore(release): vision-agent 0.0.53

* refactor: switch model endpoints (#54)

* Switch the host of model endpoint to api.dev.landing.ai
* DRY/Abstract out the inference code in tools
* Introduce LandingaiAPIKey and support loading from .env file
* Add integration tests for four model tools
* Minor tweaks/fixes
* Remove dead code
* Bump the minor version to 0.1.0

* [skip ci] chore(release): vision-agent 0.1.1

* Pool Demo (#53)

* visualized output/reflection to handle extract_frames_

* remove ipdb

* added json mode for lmm, upgraded gpt-4-turbo

* updated reflection prompt

* refactor to make function simpler

* updated reflection prompt, add tool usage doc

* fixed format issue

* fixed type issue

* fixed test case

* [skip ci] chore(release): vision-agent 0.1.2

* feat: allow disable motion detection in frame extraction function (#55)

* Tweak frame extraction function

* remove default motion detection, extract at 0.5 fps

* LMM now takes multiple images

* removed counter

* tweaked prompt

* updated vision agent to reflect on multiple images

* fix test case

* added box distance

* adjusted prompts

---------

Co-authored-by: Yazhou Cao <[email protected]>
Co-authored-by: Dillon Laird <[email protected]>

* [skip ci] chore(release): vision-agent 0.1.3

* doc changes

* fixed merge issues

* fix color issue

* add dinov with updated endpoint

* formatting fix

* added reference mask support

* fix linting

---------

Co-authored-by: Yazhou Cao <[email protected]>
Co-authored-by: Cameron Maloney <[email protected]>
Co-authored-by: shankar_ws3 <[email protected]>
Co-authored-by: Dillon Laird <[email protected]>
Co-authored-by: Shankar <[email protected]>
6 people authored Apr 19, 2024
1 parent 85a6170 commit 7d72439
Showing 6 changed files with 147 additions and 1 deletion.
34 changes: 34 additions & 0 deletions examples/mask_app/app.py
@@ -0,0 +1,34 @@
import cv2
import streamlit as st
from PIL import Image
from streamlit_drawable_canvas import st_canvas

st.title("Image Segmentation Mask App")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"])
if uploaded_file is not None:
    image = Image.open(uploaded_file)
    orig_size = image.size

    stroke_width = st.sidebar.slider("Stroke width: ", 1, 50, 25)
    stroke_color = st.sidebar.color_picker("Stroke color hex: ")

    canvas_result = st_canvas(
        fill_color="rgba(255, 165, 0, 0.3)",  # Fixed fill color with some opacity
        stroke_width=stroke_width,
        stroke_color=stroke_color,
        background_color="#eee",
        background_image=Image.open(uploaded_file) if uploaded_file else None,
        update_streamlit=True,
        height=500,
        drawing_mode="freedraw",
        key="canvas",
    )

    if canvas_result.image_data is not None:
        mask = canvas_result.image_data.astype("uint8")[..., 3]
        mask[mask > 0] = 255
        if st.button("Save Mask Image") and orig_size:
            mask = cv2.resize(mask, orig_size, interpolation=cv2.INTER_NEAREST)
            cv2.imwrite("mask.png", mask)
            st.success("Mask Image saved successfully.")
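The mask extraction in the app is easy to check in isolation: the canvas returns RGBA pixel data, and the alpha channel marks drawn strokes. A minimal sketch, with a synthetic array standing in for `canvas_result.image_data`:

```python
import numpy as np

# Synthetic stand-in for canvas_result.image_data: a 4x4 RGBA canvas
# where a single pixel has been drawn on (non-zero alpha).
image_data = np.zeros((4, 4, 4), dtype="uint8")
image_data[1, 2] = [255, 165, 0, 77]  # a semi-transparent orange stroke

# Same logic as the app: keep only the alpha channel, then binarize it.
mask = image_data.astype("uint8")[..., 3]
mask[mask > 0] = 255
```

Resizing back to `orig_size` with `cv2.INTER_NEAREST`, as the app does, keeps the mask strictly binary.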
2 changes: 2 additions & 0 deletions examples/mask_app/requirements.txt
@@ -0,0 +1,2 @@
streamlit
streamlit-drawable-canvas
8 changes: 8 additions & 0 deletions vision_agent/agent/vision_agent.py
@@ -365,6 +365,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
            "grounding_sam_",
            "grounding_dino_",
            "extract_frames_",
            "dinov_",
        ]:
            continue

@@ -469,11 +470,18 @@ def chat_with_workflow(
        self,
        chat: List[Dict[str, str]],
        image: Optional[Union[str, Path]] = None,
        reference_data: Optional[Dict[str, str]] = None,
        visualize_output: Optional[bool] = False,
    ) -> Tuple[str, List[Dict]]:
        question = chat[0]["content"]
        if image:
            question += f" Image name: {image}"
        if reference_data:
            if not ("image" in reference_data and "mask" in reference_data):
                raise ValueError(
                    f"Reference data must contain 'image' and 'mask', but got {reference_data}"
                )
            question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"

        reflections = ""
        final_answer = ""
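The new `reference_data` validation can be exercised on its own. A minimal sketch of the same check and of the suffix the agent appends to the question (the function name is mine, not the repo's):

```python
from typing import Dict, Optional


def build_reference_suffix(reference_data: Optional[Dict[str, str]]) -> str:
    """Mirror the chat_with_workflow check: require both keys, then build
    the suffix appended to the question. (Hypothetical helper name.)"""
    if not reference_data:
        return ""
    if not ("image" in reference_data and "mask" in reference_data):
        raise ValueError(
            f"Reference data must contain 'image' and 'mask', but got {reference_data}"
        )
    return (
        f" Reference image: {reference_data['image']},"
        f" Reference mask: {reference_data['mask']}"
    )


suffix = build_reference_suffix({"image": "balloon.jpg", "mask": "balloon_mask.jpg"})
```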
4 changes: 3 additions & 1 deletion vision_agent/image_utils.py
@@ -103,7 +103,9 @@ def overlay_bboxes(
    elif isinstance(image, np.ndarray):
        image = Image.fromarray(image)

-    color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
+    color = {
+        label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"]))
+    }

    width, height = image.size
    fontsize = max(12, int(min(width, height) / 40))
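The change deduplicates labels before enumerating, so a repeated label no longer burns an extra palette slot per occurrence. A standalone sketch of the idea (the three-color palette here is illustrative, not the module's actual `COLORS` list):

```python
# Illustrative palette standing in for the module's COLORS list.
COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)]

labels = ["cat", "dog", "cat", "bird", "dog"]

# Enumerating the raw list would consume one palette index per repeat;
# enumerating the deduplicated set assigns one color per distinct label.
color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(set(labels))}
```

One caveat: set iteration order is not guaranteed, so which color a given label receives can differ between runs; only the distinct-label-per-color property is fixed.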
1 change: 1 addition & 0 deletions vision_agent/tools/__init__.py
@@ -6,6 +6,7 @@
    BboxIoU,
    BoxDistance,
    Crop,
    DINOv,
    ExtractFrames,
    GroundingDINO,
    GroundingSAM,
99 changes: 99 additions & 0 deletions vision_agent/tools/tools.py
@@ -372,6 +372,104 @@ def __call__(
        return ret_pred


class DINOv(Tool):
    r"""DINOv is a tool that can detect and segment similar objects given reference segmentation masks.

    Example
    -------
        >>> import vision_agent as va
        >>> t = va.tools.DINOv()
        >>> t(prompt=[{"mask": "balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg")
        [{'scores': [0.512, 0.212],
          'masks': [array([[0, 0, 0, ..., 0, 0, 0],
                           ...,
                           [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
                    array([[0, 0, 0, ..., 0, 0, 0],
                           ...,
                           [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}]
    """

    name = "dinov_"
    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
    usage = {
        "required_parameters": [
            {"name": "prompt", "type": "List[Dict[str, str]]"},
            {"name": "image", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Can you find all the balloons in this image that are similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
                "parameters": {
                    "prompt": [
                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
                    ],
                    "image": "input.jpg",
                },
            },
            {
                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: background.png Reference mask: mask.png",
                "parameters": {
                    "prompt": [
                        {"mask": "mask.png", "image": "background.png"},
                    ],
                    "image": "original.jpg",
                },
            },
        ],
    }

    def __call__(
        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
    ) -> Dict:
        """Invoke the DINOv model.

        Parameters:
            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
            image: the input image to segment.

        Returns:
            A dictionary with the keys 'scores', 'masks', and 'mask_shape', storing the detected segmentation masks and their scores.
        """
        image_b64 = convert_to_b64(image)
        for p in prompt:
            p["mask"] = convert_to_b64(p["mask"])
            p["image"] = convert_to_b64(p["image"])
        request_data = {
            "prompt": prompt,
            "image": image_b64,
            "tool": "dinov",
        }
        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
        if "bboxes" in data:
            data["bboxes"] = [
                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
            ]
        if "masks" in data:
            data["masks"] = [
                rle_decode(mask_rle=mask, shape=data["mask_shape"])
                for mask in data["masks"]
            ]
        data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
        return data
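`normalize_bbox` and `rle_decode` are helpers defined elsewhere in the module and not shown in this diff, so the following is only a plausible sketch of what they do, under two assumptions: boxes are scaled into [0, 1] by the image's (height, width), and masks arrive as run-length strings in the common 1-indexed, column-major convention:

```python
from typing import List, Tuple

import numpy as np


def normalize_bbox(bbox: List[float], image_size: Tuple[int, int]) -> List[float]:
    """Assumed behavior: scale a pixel [x1, y1, x2, y2] box into [0, 1]
    using the image's (height, width)."""
    h, w = image_size
    x1, y1, x2, y2 = bbox
    return [round(x1 / w, 2), round(y1 / h, 2), round(x2 / w, 2), round(y2 / h, 2)]


def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray:
    """Assumed behavior: decode a '<start> <length> ...' RLE string
    (1-indexed, column-major) into a binary uint8 mask of the given shape."""
    s = mask_rle.split()
    starts = np.asarray(s[0::2], dtype=int) - 1
    lengths = np.asarray(s[1::2], dtype=int)
    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for lo, length in zip(starts, lengths):
        flat[lo : lo + length] = 1
    # Column-major (Fortran) order matches the 1-indexed RLE convention.
    return flat.reshape(shape, order="F")


mask = rle_decode("1 3 10 2", (4, 4))
```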


class AgentDINOv(DINOv):
    def __call__(
        self,
        prompt: List[Dict[str, str]],
        image: Union[str, ImageType],
    ) -> Dict:
        rets = super().__call__(prompt, image)
        mask_files = []
        for mask in rets["masks"]:
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                file_name = Path(tmp.name).with_suffix(".mask.png")
                Image.fromarray(mask * 255).save(file_name)
                mask_files.append(str(file_name))
        rets["masks"] = mask_files
        return rets


class AgentGroundingSAM(GroundingSAM):
    r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
    and returns the file names. This makes it easier for agents to use.

@@ -652,6 +750,7 @@ def __call__(self, equation: str) -> float:
    ImageCaption,
    GroundingDINO,
    AgentGroundingSAM,
    AgentDINOv,
    ExtractFrames,
    Crop,
    BboxArea,
