Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add image caption tool #52

Merged
merged 1 commit into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion vision_agent/agent/vision_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ def __init__(
task_model: Optional[Union[LLM, LMM]] = None,
answer_model: Optional[Union[LLM, LMM]] = None,
reflect_model: Optional[Union[LLM, LMM]] = None,
max_retries: int = 3,
max_retries: int = 2,
verbose: bool = False,
report_progress_callback: Optional[Callable[[str], None]] = None,
):
Expand Down
1 change: 1 addition & 0 deletions vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
ExtractFrames,
GroundingDINO,
GroundingSAM,
ImageCaption,
SegArea,
SegIoU,
Tool,
Expand Down
69 changes: 69 additions & 0 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,74 @@ def __call__(self, prompt: str, image: Union[str, ImageType]) -> Dict:
return resp_json["data"] # type: ignore


class ImageCaption(Tool):
    r"""ImageCaption is a tool that can caption an image based on its contents
    or tags.

    The image is base64-encoded and posted to a hosted captioning endpoint;
    the ``data`` field of the JSON response is returned to the caller.

    Example
    -------
        >>> import vision_agent as va
        >>> caption = va.tools.ImageCaption()
        >>> caption("image1.jpg")
        {'text': ['a box of orange and white socks']}
    """

    _ENDPOINT = "https://soi4ewr6fjqqdf5vuss6rrilee0kumxq.lambda-url.us-east-2.on.aws"

    # Metadata describing the tool: its registered name, a natural-language
    # description, and usage examples.
    name = "image_caption_"
    description = "'image_caption_' is a tool that can caption an image based on its contents or tags. It returns a text describing the image"
    usage = {
        "required_parameters": [
            {"name": "image", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Can you describe this image ? Image name: cat.jpg",
                "parameters": {"image": "cat.jpg"},
            },
            {
                "scenario": "Can you caption this image with their main contents ? Image name: cat_dog.jpg",
                "parameters": {"image": "cat_dog.jpg"},
            },
            {
                "scenario": "Can you build me a image captioning tool ? Image name: shirts.jpg",
                "parameters": {
                    "image": "shirts.jpg",
                },
            },
        ],
    }

    # TODO: Add support for input multiple images, which aligns with the output type.
    def __call__(self, image: Union[str, ImageType]) -> Dict:
        """Invoke the image captioning model.

        Parameters:
            image: the input image to caption, given either as a string
                (a file name such as "image1.jpg") or as an image object.

        Returns:
            The ``data`` field of the service response. Per the class
            example this is a dictionary holding the generated caption(s),
            e.g. {"text": ["a box of orange and white socks"]}.

        Raises:
            ValueError: if the response JSON has no ``statusCode`` field or
                its ``statusCode`` is not 200.
        """
        payload = {
            "image": convert_to_b64(image),
            "tool": "image_captioning",
        }
        res = requests.post(
            self._ENDPOINT,
            headers={"Content-Type": "application/json"},
            json=payload,
        )
        resp_json: Dict[str, Any] = res.json()

        # A missing "statusCode" yields None, which also differs from 200, so
        # this single comparison covers both the "absent" and "not OK" cases.
        if resp_json.get("statusCode") != 200:
            message = f"Request failed: {resp_json}"
            _LOGGER.error(message)
            raise ValueError(message)

        return resp_json["data"]  # type: ignore


class GroundingDINO(Tool):
r"""Grounding DINO is a tool that can detect arbitrary objects with inputs such as
category names or referring expressions.
Expand Down Expand Up @@ -631,6 +699,7 @@ def __call__(self, equation: str) -> float:
[
NoOp,
CLIP,
ImageCaption,
GroundingDINO,
AgentGroundingSAM,
ExtractFrames,
Expand Down
Loading