From 7d7243915bbbd461537571b0a01293b93ac78983 Mon Sep 17 00:00:00 2001 From: Asia <2736300+humpydonkey@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:14:53 -0700 Subject: [PATCH] Add DINOv as a new tool (#44) * Add DINOv as a new tool * Fix lint errors * Update docs * Fix param name mismatch (#45) Co-authored-by: Yazhou Cao * Grammar/Spelling fixes (#46) * Update prompts.py * Update vision_agent_prompts.py * Update reflexion_prompts.py * Update vision_agent_prompts.py * Update easytool_prompts.py * Update prompts.py * Update vision_agent_prompts.py * Switch to the tools endpoint (#40) * get endpoint ready for demo fixed tools.json Update vision_agent/tools/tools.py Bug fixes * Fix linter errors * Fix a bug in result parsing * Include scores in the G-SAM model response * Removed tools.json , need to find better format * Fixing the endpoint for CLIP and adding thresholds for grounding tools * fix mypy errors * fixed example notebook --------- Co-authored-by: Yazhou Cao Co-authored-by: shankar_ws3 * Support streaming chat logs of an agent (#47) Add a callback for reporting the chat progress of an agent * Empty-Commit * Empty-Commit: attempt to fix release * [skip ci] chore(release): vision-agent 0.0.49 * Fix typo (#48) Co-authored-by: Yazhou Cao * [skip ci] chore(release): vision-agent 0.0.50 * Fix a typo in log (#49) Fix another typo Co-authored-by: Yazhou Cao * [skip ci] chore(release): vision-agent 0.0.51 * Fix Baby Cam Use Case (#51) * fix visualization error * added font and score to viz * changed to smaller font file * Support streaming chat logs of an agent (#47) Add a callback for reporting the chat progress of an agent * fix visualize score issue * updated descriptions, fixed counter bug * added visualize_output * make feedback more concrete * made naming more consistent * replaced individual calc ops with calculator tool * fix random colors * fix prompts for tools * update reflection prompt * update readme * formatting fix * fixed mypy errors * fix 
merge issue --------- Co-authored-by: Asia <2736300+humpydonkey@users.noreply.github.com> * [skip ci] chore(release): vision-agent 0.0.52 * Add image caption tool (#52) added image caption tool * [skip ci] chore(release): vision-agent 0.0.53 * refactor: switch model endpoints (#54) * Switch the host of model endpoint to api.dev.landing.ai * DRY/Abstract out the inference code in tools * Introduce LandingaiAPIKey and support loading from .env file * Add integration tests for four model tools * Minor tweaks/fixes * Remove dead code * Bump the minor version to 0.1.0 * [skip ci] chore(release): vision-agent 0.1.1 * Pool Demo (#53) * visualized output/reflection to handle extract_frames_ * remove ipdb * added json mode for lmm, upgraded gpt-4-turbo * updated reflection prompt * refactor to make function simpler * updated reflection prompt, add tool usage doc * fixed format issue * fixed type issue * fixed test case * [skip ci] chore(release): vision-agent 0.1.2 * feat: allow disable motion detection in frame extraction function (#55) * Tweak frame extraction function * remove default motion detection, extract at 0.5 fps * lmm now take multiple images * removed counter * tweaked prompt * updated vision agent to reflect on multiple images * fix test case * added box distance * adjusted prompts --------- Co-authored-by: Yazhou Cao Co-authored-by: Dillon Laird * [skip ci] chore(release): vision-agent 0.1.3 * doc changes * fixed merge issues * fix color issue * add dinov with updated endpoint * formatting fix * added reference mask support * fix linting --------- Co-authored-by: Yazhou Cao Co-authored-by: Cameron Maloney Co-authored-by: shankar_ws3 Co-authored-by: Dillon Laird Co-authored-by: Shankar <90070882+shankar-landing-ai@users.noreply.github.com> --- examples/mask_app/app.py | 34 ++++++++++ examples/mask_app/requirements.txt | 2 + vision_agent/agent/vision_agent.py | 8 +++ vision_agent/image_utils.py | 4 +- vision_agent/tools/__init__.py | 1 + 
vision_agent/tools/tools.py | 99 ++++++++++++++++++++++++++++++ 6 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 examples/mask_app/app.py create mode 100644 examples/mask_app/requirements.txt diff --git a/examples/mask_app/app.py b/examples/mask_app/app.py new file mode 100644 index 00000000..23a5fc78 --- /dev/null +++ b/examples/mask_app/app.py @@ -0,0 +1,34 @@ +import cv2 +import streamlit as st +from PIL import Image +from streamlit_drawable_canvas import st_canvas + +st.title("Image Segmentation Mask App") + +uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"]) +# Keep orig_size defined (None) when nothing is uploaded so the save guard works. +image = Image.open(uploaded_file) if uploaded_file is not None else None +orig_size = image.size if image is not None else None + +stroke_width = st.sidebar.slider("Stroke width: ", 1, 50, 25) +stroke_color = st.sidebar.color_picker("Stroke color hex: ") + +canvas_result = st_canvas( + fill_color="rgba(255, 165, 0, 0.3)", # Fixed fill color with some opacity + stroke_width=stroke_width, + stroke_color=stroke_color, + background_color="#eee", + background_image=Image.open(uploaded_file) if uploaded_file else None, + update_streamlit=True, + height=500, + drawing_mode="freedraw", + key="canvas", +) + +if canvas_result.image_data is not None: + mask = canvas_result.image_data.astype("uint8")[..., 3] + mask[mask > 0] = 255 + if st.button("Save Mask Image") and orig_size: + mask = cv2.resize(mask, orig_size, interpolation=cv2.INTER_NEAREST) + cv2.imwrite("mask.png", mask) + st.success("Mask Image saved successfully.") diff --git a/examples/mask_app/requirements.txt b/examples/mask_app/requirements.txt new file mode 100644 index 00000000..3ce2aea0 --- /dev/null +++ b/examples/mask_app/requirements.txt @@ -0,0 +1,2 @@ +streamlit +streamlit-drawable-canvas diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index a3f09b82..44c3aa08 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -365,6 +365,7 @@ def 
visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]] "grounding_sam_", "grounding_dino_", "extract_frames_", + "dinov_", ]: continue @@ -469,11 +470,18 @@ def chat_with_workflow( self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None, + reference_data: Optional[Dict[str, str]] = None, visualize_output: Optional[bool] = False, ) -> Tuple[str, List[Dict]]: question = chat[0]["content"] if image: question += f" Image name: {image}" + if reference_data: + if not ("image" in reference_data and "mask" in reference_data): + raise ValueError( + f"Reference data must contain 'image' and 'mask'. but got {reference_data}" + ) + question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}" reflections = "" final_answer = "" diff --git a/vision_agent/image_utils.py b/vision_agent/image_utils.py index d0164e11..f36a2033 100644 --- a/vision_agent/image_utils.py +++ b/vision_agent/image_utils.py @@ -103,7 +103,9 @@ def overlay_bboxes( elif isinstance(image, np.ndarray): image = Image.fromarray(image) - color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])} + color = { + label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"])) + } width, height = image.size fontsize = max(12, int(min(width, height) / 40)) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 63931c9f..1c1c6e73 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -6,6 +6,7 @@ BboxIoU, BoxDistance, Crop, + DINOv, ExtractFrames, GroundingDINO, GroundingSAM, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 3a5c8a4f..6d2a7b47 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -372,6 +372,104 @@ def __call__( return ret_pred +class DINOv(Tool): + r"""DINOv is a tool that can detect and segment similar objects with the given input masks. 
+ + Example + ------- + >>> import vision_agent as va + >>> t = va.tools.DINOv() + >>> t(prompt=[{"mask": "balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg") + {'scores': [0.512, 0.212], + 'masks': [array([[0, 0, 0, ..., 0, 0, 0], + ..., + [0, 0, 0, ..., 0, 0, 0]], dtype=uint8), + array([[0, 0, 0, ..., 0, 0, 0], + ..., + [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]} + """ + + name = "dinov_" + description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask." + usage = { + "required_parameters": [ + {"name": "prompt", "type": "List[Dict[str, str]]"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg", + "parameters": { + "prompt": [ + {"mask": "balloon_mask.jpg", "image": "balloon.jpg"}, + ], + "image": "input.jpg", + }, + }, + { + "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: background.png Reference mask: mask.png", + "parameters": { + "prompt": [ + {"mask": "mask.png", "image": "background.png"}, + ], + "image": "original.jpg", + }, + }, + ], + } + + def __call__( + self, prompt: List[Dict[str, str]], image: Union[str, ImageType] + ) -> Dict: + """Invoke the DINOv model. + + Parameters: + prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}. + image: the input image to segment. + + Returns: + A dictionary with the keys 'scores', 'masks' and 'mask_shape', holding the detected segmentation masks and their scores. 
+ """ + image_b64 = convert_to_b64(image) + prompt = [dict(p) for p in prompt] # copy: keep the caller's file paths intact + for p in prompt: + p.update(mask=convert_to_b64(p["mask"]), image=convert_to_b64(p["image"])) + request_data = { + "prompt": prompt, + "image": image_b64, + "tool": "dinov", + } + data: Dict[str, Any] = _send_inference_request(request_data, "dinov") + if "bboxes" in data: + data["bboxes"] = [ + normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"] + ] + if "masks" in data: + data["masks"] = [ + rle_decode(mask_rle=mask, shape=data["mask_shape"]) + for mask in data["masks"] + ] + data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))] + return data + + +class AgentDINOv(DINOv): + def __call__( + self, + prompt: List[Dict[str, str]], + image: Union[str, ImageType], + ) -> Dict: + rets = super().__call__(prompt, image) + mask_files = [] + for mask in rets.get("masks", []): + with tempfile.NamedTemporaryFile(suffix=".mask.png", delete=False) as tmp: + file_name = Path(tmp.name) # save into the reserved temp file; no orphan .png + Image.fromarray(mask * 255).save(file_name) + mask_files.append(str(file_name)) + rets["masks"] = mask_files + return rets + + class AgentGroundingSAM(GroundingSAM): r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files returns the file name. This makes it easier for agents to use. @@ -652,6 +750,7 @@ def __call__(self, equation: str) -> float: ImageCaption, GroundingDINO, AgentGroundingSAM, + AgentDINOv, ExtractFrames, Crop, BboxArea,