Skip to content

Commit a018360

Browse files
Adding counting tools to vision agent
1 parent 85a6170 commit a018360

File tree

2 files changed

+125
-0
lines changed

2 files changed

+125
-0
lines changed

vision_agent/tools/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
GroundingDINO,
1111
GroundingSAM,
1212
ImageCaption,
13+
ZeroShotCounting,
14+
VisualPromptCounting,
1315
SegArea,
1416
SegIoU,
1517
Tool,

vision_agent/tools/tools.py

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,127 @@ def __call__(
395395
return rets
396396

397397

398+
class ZeroShotCounting(Tool):
    r"""Tool that counts the instances of an object class present in an image.

    Only the image is supplied: no text prompt and no visual prompt is sent
    with the request.

    Example
    -------
        >>> import vision_agent as va
        >>> zshot_count = va.tools.ZeroShotCounting()
        >>> zshot_count("image1.jpg")
        {'count': 45}
    """

    # Identifier and natural-language description exposed through the Tool
    # interface; these are runtime strings and are kept verbatim.
    name = "zero_shot_counting_"
    description = """'zero_shot_counting_' is a tool that can count total number of instances of an object present in an image belonging to the same class without a text or visual prompt.
It returns the total count of the objects."""

    # Parameter schema plus example scenario -> parameters mappings.
    usage = {
        "required_parameters": [{"name": "image", "type": "str"}],
        "examples": [
            {
                "scenario": "Can you count the lids in the image ? Image name: lids.jpg",
                "parameters": {"image": "lids.jpg"},
            },
            {
                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
                "parameters": {"image": "tray.jpg"},
            },
            {
                "scenario": "Can you build me an object counting tool ? Image name: shirts.jpg",
                "parameters": {"image": "shirts.jpg"},
            },
        ],
    }

    # TODO: Add support for input multiple images, which aligns with the output type.
    def __call__(self, image: Union[str, ImageType]) -> Dict:
        """Send the image to the inference service as a zero-shot counting request.

        Parameters:
            image: the input image; it is passed through ``convert_to_b64``
                before being placed in the request payload.

        Returns:
            The value returned by ``_send_inference_request`` for the "tools"
            endpoint. Expected to be a dictionary holding the key 'count' with
            the count as value, e.g. {'count': 12}.
        """
        payload = {
            "image": convert_to_b64(image),
            "tool": "zero_shot_counting",
        }
        return _send_inference_request(payload, "tools")
451+
452+
453+
class VisualPromptCounting(Tool):
    r"""Tool that counts the instances of an object class present in an image,
    guided by a visual prompt in the form of a bounding box.

    Example
    -------
        >>> import vision_agent as va
        >>> prompt_count = va.tools.VisualPromptCounting()
        >>> prompt_count(image="image1.jpg", prompt="100, 100, 200, 250")
        {'count': 23}
    """

    # Identifier and natural-language description exposed through the Tool
    # interface; these are runtime strings and are kept verbatim.
    name = "visual_prompt_counting_"
    description = """'visual_prompt_counting_' is a tool that can count total number of instances of an object present in an image belonging to the same class given an
example bounding box around a single instance. It returns the total count of the objects."""

    # Parameter schema plus example scenario -> parameters mappings.
    usage = {
        "required_parameters": [
            {"name": "image", "type": "str"},
            {"name": "prompt", "type": "str"},
        ],
        "examples": [
            {
                "scenario": "Here is an example of a lid '200, 200, 250, 300', Can you count the lids in the image ? Image name: lids.jpg",
                "parameters": {"image": "lids.jpg", "prompt": "200, 200, 250, 300"},
            },
            {
                "scenario": "Can you count the total number of objects in this image ? Image name: tray.jpg",
                "parameters": {"image": "tray.jpg", "prompt": "100, 100, 200, 250"},
            },
            {
                "scenario": "Can you build me a few shot object counting tool ? Image name: shirts.jpg",
                "parameters": {"image": "shirts.jpg", "prompt": "100, 100, 200, 250"},
            },
            {
                "scenario": "Can you build me a counting tool based on an example prompt ? Image name: shoes.jpg",
                "parameters": {"image": "shoes.jpg", "prompt": "150, 100, 500, 550"},
            },
        ],
    }

    # TODO: Add support for input multiple images, which aligns with the output type.
    def __call__(self, image: Union[str, ImageType], prompt: str) -> Dict:
        """Send the image and its visual prompt to the inference service.

        Parameters:
            image: the input image; it is passed through ``convert_to_b64``
                before being placed in the request payload.
            prompt: the example bounding box as a string, forwarded unchanged.
                The examples use four comma-separated numbers such as
                "100, 100, 200, 250".
                NOTE(review): the coordinate order is not shown here; confirm
                against the inference service.

        Returns:
            The value returned by ``_send_inference_request`` for the "tools"
            endpoint. Expected to be a dictionary holding the key 'count' with
            the count as value, e.g. {'count': 12}.
        """
        # The request is issued under the "few_shot_counting" tool name, which
        # differs from this class's own ``name`` attribute.
        payload = {
            "image": convert_to_b64(image),
            "prompt": prompt,
            "tool": "few_shot_counting",
        }
        return _send_inference_request(payload, "tools")
517+
518+
398519
class Crop(Tool):
399520
r"""Crop crops an image given a bounding box and returns a file name of the cropped image."""
400521

@@ -652,6 +773,8 @@ def __call__(self, equation: str) -> float:
652773
ImageCaption,
653774
GroundingDINO,
654775
AgentGroundingSAM,
776+
ZeroShotCounting,
777+
VisualPromptCounting,
655778
ExtractFrames,
656779
Crop,
657780
BboxArea,

0 commit comments

Comments
 (0)