Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add DINOv as a new tool #44

Merged
merged 32 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
44feb20
Add DINOv as a new tool
AsiaCao Apr 9, 2024
b18eabe
Fix lint errors
AsiaCao Apr 9, 2024
05b1ad5
Update docs
AsiaCao Apr 9, 2024
cc2035c
Fix param name mismatch (#45)
humpydonkey Apr 9, 2024
5ace291
Grammar/Spelling fixes (#46)
cmaloney111 Apr 10, 2024
0cd57ef
Switch to the tools endpoint (#40)
humpydonkey Apr 10, 2024
073d40b
Support streaming chat logs of an agent (#47)
humpydonkey Apr 11, 2024
dfce50b
Empty-Commit
AsiaCao Apr 11, 2024
1498307
Empty-Commit: attempt to fix release
AsiaCao Apr 11, 2024
109eb87
[skip ci] chore(release): vision-agent 0.0.49
AsiaCao Apr 11, 2024
32c4738
Fix typo (#48)
humpydonkey Apr 11, 2024
a11c12d
[skip ci] chore(release): vision-agent 0.0.50
AsiaCao Apr 11, 2024
4da5d72
Fix a typo in log (#49)
humpydonkey Apr 11, 2024
e062992
[skip ci] chore(release): vision-agent 0.0.51
AsiaCao Apr 11, 2024
b7cdbee
Fix Baby Cam Use Case (#51)
dillonalaird Apr 12, 2024
ec1a73b
[skip ci] chore(release): vision-agent 0.0.52
AsiaCao Apr 12, 2024
fbe404c
Add image caption tool (#52)
shankar-vision-eng Apr 15, 2024
9542d22
[skip ci] chore(release): vision-agent 0.0.53
AsiaCao Apr 15, 2024
248070e
refactor: switch model endpoints (#54)
humpydonkey Apr 15, 2024
66217a4
[skip ci] chore(release): vision-agent 0.1.1
AsiaCao Apr 15, 2024
fc94a2e
Pool Demo (#53)
dillonalaird Apr 16, 2024
1055aea
[skip ci] chore(release): vision-agent 0.1.2
AsiaCao Apr 16, 2024
c439bde
feat: allow disable motion detection in frame extraction function (#55)
humpydonkey Apr 17, 2024
da818ad
[skip ci] chore(release): vision-agent 0.1.3
AsiaCao Apr 17, 2024
33cdf31
Merge branch 'main' into add-dinov
dillonalaird Apr 17, 2024
94f94ba
doc changes
dillonalaird Apr 17, 2024
d1d3268
fixed merge issues
dillonalaird Apr 17, 2024
81f3cf3
fix color issue
dillonalaird Apr 18, 2024
2a69082
add dinov with updated endpoint
dillonalaird Apr 18, 2024
34580e1
formatting fix
dillonalaird Apr 18, 2024
6b232ed
added reference mask support
dillonalaird Apr 18, 2024
20a687a
fix linting
dillonalaird Apr 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions examples/mask_app/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import cv2
import streamlit as st
from PIL import Image
from streamlit_drawable_canvas import st_canvas

# Streamlit app: upload an image, draw over it on a canvas, and save the drawn
# strokes as a binary mask ("mask.png") resized to the uploaded image's size.
st.title("Image Segmentation Mask App")

uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "png"])

# Open the upload exactly once. Both names stay None until a file is chosen so
# that the save branch below never reads an unbound variable.
background_image = None
orig_size = None
if uploaded_file is not None:
    background_image = Image.open(uploaded_file)
    orig_size = background_image.size

stroke_width = st.sidebar.slider("Stroke width: ", 1, 50, 25)
stroke_color = st.sidebar.color_picker("Stroke color hex: ")

canvas_result = st_canvas(
    fill_color="rgba(255, 165, 0, 0.3)",  # Fixed fill color with some opacity
    stroke_width=stroke_width,
    stroke_color=stroke_color,
    background_color="#eee",
    background_image=background_image,
    update_streamlit=True,
    height=500,
    drawing_mode="freedraw",
    key="canvas",
)

if canvas_result.image_data is not None:
    # Channel index 3 of the canvas data is used as the mask source
    # (presumably the alpha channel of an RGBA array -- TODO confirm).
    mask = canvas_result.image_data.astype("uint8")[..., 3]
    # Binarize: any non-zero pixel becomes 255.
    mask[mask > 0] = 255
    if st.button("Save Mask Image"):
        if orig_size is None:
            # Without an upload there is no target size to resize the mask to.
            st.warning("Upload an image before saving a mask.")
        else:
            # Nearest-neighbour keeps the mask strictly two-valued after resizing.
            mask = cv2.resize(mask, orig_size, interpolation=cv2.INTER_NEAREST)
            # cv2.imwrite reports failure through its return value.
            if cv2.imwrite("mask.png", mask):
                st.success("Mask Image saved successfully.")
            else:
                st.error("Failed to save mask image.")
2 changes: 2 additions & 0 deletions examples/mask_app/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
streamlit
streamlit-drawable-canvas
8 changes: 8 additions & 0 deletions vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ def visualize_result(all_tool_results: List[Dict]) -> Sequence[Union[str, Path]]
"grounding_sam_",
"grounding_dino_",
"extract_frames_",
"dinov_",
]:
continue

Expand Down Expand Up @@ -469,11 +470,18 @@ def chat_with_workflow(
self,
chat: List[Dict[str, str]],
image: Optional[Union[str, Path]] = None,
reference_data: Optional[Dict[str, str]] = None,
visualize_output: Optional[bool] = False,
) -> Tuple[str, List[Dict]]:
question = chat[0]["content"]
if image:
question += f" Image name: {image}"
if reference_data:
if not ("image" in reference_data and "mask" in reference_data):
raise ValueError(
f"Reference data must contain 'image' and 'mask'. but got {reference_data}"
)
question += f" Reference image: {reference_data['image']}, Reference mask: {reference_data['mask']}"

reflections = ""
final_answer = ""
Expand Down
4 changes: 3 additions & 1 deletion vision_agent/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,9 @@ def overlay_bboxes(
elif isinstance(image, np.ndarray):
image = Image.fromarray(image)

color = {label: COLORS[i % len(COLORS)] for i, label in enumerate(bboxes["labels"])}
color = {
label: COLORS[i % len(COLORS)] for i, label in enumerate(set(bboxes["labels"]))
}

width, height = image.size
fontsize = max(12, int(min(width, height) / 40))
Expand Down
1 change: 1 addition & 0 deletions vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
BboxIoU,
BoxDistance,
Crop,
DINOv,
ExtractFrames,
GroundingDINO,
GroundingSAM,
Expand Down
99 changes: 99 additions & 0 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,104 @@ def __call__(
return ret_pred


class DINOv(Tool):
    r"""DINOv is a tool that can detect and segment similar objects with the given input masks.

    Example
    -------
        >>> import vision_agent as va
        >>> t = va.tools.DINOv()
        >>> t(prompt=[{"mask":"balloon_mask.jpg", "image": "balloon.jpg"}], image="balloon.jpg")
        {'scores': [0.512, 0.212],
         'masks': [array([[0, 0, 0, ..., 0, 0, 0],
            ...,
            [0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
            array([[0, 0, 0, ..., 0, 0, 0],
            ...,
            [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)]}
    """

    name = "dinov_"
    description = "'dinov_' is a tool that can detect and segment similar objects given a reference segmentation mask."
    usage = {
        "required_parameters": [
            {"name": "prompt", "type": "List[Dict[str, str]]"},
            {"name": "image", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Can you find all the balloons in this image that is similar to the provided masked area? Image name: input.jpg Reference image: balloon.jpg Reference mask: balloon_mask.jpg",
                "parameters": {
                    "prompt": [
                        {"mask": "balloon_mask.jpg", "image": "balloon.jpg"},
                    ],
                    "image": "input.jpg",
                },
            },
            {
                # The scenario text must name the same files as `parameters`
                # below: reference image -> "image", reference mask -> "mask".
                "scenario": "Detect all the objects in this image that are similar to the provided mask. Image name: original.jpg Reference image: background.png Reference mask: mask.png",
                "parameters": {
                    "prompt": [
                        {"mask": "mask.png", "image": "background.png"},
                    ],
                    "image": "original.jpg",
                },
            },
        ],
    }

    def __call__(
        self, prompt: List[Dict[str, str]], image: Union[str, ImageType]
    ) -> Dict:
        """Invoke the DINOv model.

        Parameters:
            prompt: a list of visual prompts in the form of {'mask': 'MASK_FILE_PATH', 'image': 'IMAGE_FILE_PATH'}.
                The list and its dictionaries are not modified.
            image: the input image to segment.

        Returns:
            The response dictionary from the inference request. When it contains
            'masks', each mask is decoded using the response's 'mask_shape' and a
            'labels' list (one "visual prompt" entry per mask) is added. When it
            contains 'bboxes', each box is normalized using 'mask_shape'.

        Raises:
            KeyError: if a prompt entry lacks 'mask' or 'image'.
        """
        image_b64 = convert_to_b64(image)
        # Build fresh dicts rather than overwriting the caller's entries, so the
        # caller's prompt list still holds its original values afterwards and
        # can be reused for another call.
        encoded_prompt = [
            {
                **p,
                "mask": convert_to_b64(p["mask"]),
                "image": convert_to_b64(p["image"]),
            }
            for p in prompt
        ]
        request_data = {
            "prompt": encoded_prompt,
            "image": image_b64,
            "tool": "dinov",
        }
        data: Dict[str, Any] = _send_inference_request(request_data, "dinov")
        if "bboxes" in data:
            data["bboxes"] = [
                normalize_bbox(box, data["mask_shape"]) for box in data["bboxes"]
            ]
        if "masks" in data:
            data["masks"] = [
                rle_decode(mask_rle=mask, shape=data["mask_shape"])
                for mask in data["masks"]
            ]
            # Every detection comes from the visual prompt, so all share one label.
            data["labels"] = ["visual prompt" for _ in range(len(data["masks"]))]
        return data


class AgentDINOv(DINOv):
    r"""AgentDINOv is the same as DINOv but it saves the masks as PNG files and
    returns the file names instead of the arrays.
    """

    def __call__(
        self,
        prompt: List[Dict[str, str]],
        image: Union[str, ImageType],
    ) -> Dict:
        """Run DINOv and replace each returned mask with the path of a saved PNG.

        Parameters:
            prompt: a list of visual prompts, passed through to DINOv unchanged.
            image: the input image to segment, passed through to DINOv unchanged.

        Returns:
            The DINOv result dictionary. If it contains 'masks', that entry is
            replaced by a list of file paths (one '*.mask.png' per mask);
            otherwise the result is returned as-is.
        """
        rets = super().__call__(prompt, image)
        if "masks" not in rets:
            # Nothing to write out; return the result untouched.
            return rets

        mask_files = []
        for mask in rets["masks"]:
            # Write straight into the temp file so no empty placeholder file is
            # left behind next to the saved mask. delete=False keeps the file on
            # disk after the handle closes so the returned path stays usable.
            with tempfile.NamedTemporaryFile(suffix=".mask.png", delete=False) as tmp:
                # Scale by 255 before saving (assumes mask values are 0/1 --
                # TODO confirm). The format is given explicitly because a file
                # object, not a file name, is passed to save().
                Image.fromarray(mask * 255).save(tmp, format="PNG")
            mask_files.append(tmp.name)
        rets["masks"] = mask_files
        return rets


class AgentGroundingSAM(GroundingSAM):
r"""AgentGroundingSAM is the same as GroundingSAM but it saves the masks as files
returns the file name. This makes it easier for agents to use.
Expand Down Expand Up @@ -652,6 +750,7 @@ def __call__(self, equation: str) -> float:
ImageCaption,
GroundingDINO,
AgentGroundingSAM,
AgentDINOv,
ExtractFrames,
Crop,
BboxArea,
Expand Down
Loading