Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix issues 2 #267

Merged
merged 7 commits into from
Oct 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<div align="center">
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">

# 🔍🤖 Vision Agent
[![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
Expand Down Expand Up @@ -345,6 +345,11 @@ result = agent.generate_code(conv)


## Additional Backends
### E2B Code Execution
If you wish to run your code on the E2B backend, make sure your `E2B_API_KEY` is set
and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. All
agent-generated code will then run on the E2B backend.

### Anthropic
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
Anthropic API key and set it in your environment variables:
Expand Down
Binary file removed assets/logo.jpg
Binary file not shown.
Binary file added assets/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 5 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ result = agent.generate_code(conv)


## Additional Backends
### E2B Code Execution
If you wish to run your code on the E2B backend, make sure your `E2B_API_KEY` is set
and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. All
agent-generated code will then run on the E2B backend.

### Anthropic
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
Anthropic API key and set it in your environment variables:
Expand Down
118 changes: 92 additions & 26 deletions tests/integ/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_temporal_localization,
ixc25_video_vqa,
loca_visual_prompt_counting,
loca_zero_shot_counting,
Expand Down Expand Up @@ -71,6 +70,14 @@ def test_owl_v2_image():
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])


def test_owl_v2_image_empty():
    """owl_v2_image is expected to return an empty list for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    detections = owl_v2_image(prompt="coin", image=empty_image)
    assert detections == []


def test_owl_v2_fine_tune_id():
img = ski.data.coins()
result = owl_v2_image(
Expand Down Expand Up @@ -110,6 +117,14 @@ def test_florence2_phrase_grounding():
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])


def test_florence2_phrase_grounding_empty():
    """florence2_phrase_grounding is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    detections = florence2_phrase_grounding(image=empty_image, prompt="coin")
    assert detections == []


def test_florence2_phrase_grounding_fine_tune_id():
img = ski.data.coins()
result = florence2_phrase_grounding(
Expand Down Expand Up @@ -147,7 +162,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id():
fine_tune_id=FINE_TUNE_ID,
)
assert len(result) == 10
assert 16 <= len([res["label"] for res in result[0]]) <= 26
assert 12 <= len([res["label"] for res in result[0]]) <= 26
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result[0]])


Expand Down Expand Up @@ -195,6 +210,14 @@ def test_florence2_sam2_image_fine_tune_id():
assert len([res["mask"] for res in result]) == len(result)


def test_florence2_sam2_image_empty():
    """florence2_sam2_image is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    segments = florence2_sam2_image(prompt="coin", image=empty_image)
    assert segments == []


def test_florence2_sam2_video():
frames = [
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
Expand All @@ -208,7 +231,7 @@ def test_florence2_sam2_video():
assert len([res["mask"] for res in result[0]]) == 25


def test_segmentation():
def test_detr_segmentation():
img = ski.data.coins()
result = detr_segmentation(
image=img,
Expand All @@ -218,6 +241,13 @@ def test_segmentation():
assert len([res["mask"] for res in result]) == 1


def test_detr_segmentation_empty():
    """detr_segmentation is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    segments = detr_segmentation(image=empty_image)
    assert segments == []


def test_clip():
img = ski.data.coins()
result = clip(
Expand All @@ -227,6 +257,15 @@ def test_clip():
assert result["scores"] == [0.9999, 0.0001]


def test_clip_empty():
    """clip is expected to return empty scores and labels for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    candidate_classes = ["coins", "notes"]
    prediction = clip(classes=candidate_classes, image=empty_image)
    # Scores are checked before labels, matching the original assertion order.
    assert prediction["scores"] == []
    assert prediction["labels"] == []


def test_vit_classification():
img = ski.data.coins()
result = vit_image_classification(
Expand All @@ -235,6 +274,14 @@ def test_vit_classification():
assert "typewriter keyboard" in result["labels"]


def test_vit_classification_empty():
    """vit_image_classification is expected to return empty labels and scores for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    prediction = vit_image_classification(image=empty_image)
    # Labels are checked before scores, matching the original assertion order.
    assert prediction["labels"] == []
    assert prediction["scores"] == []


def test_nsfw_classification():
img = ski.data.coins()
result = vit_nsfw_classification(
Expand All @@ -243,23 +290,23 @@ def test_nsfw_classification():
assert result["label"] == "normal"


def test_image_caption() -> None:
def test_image_caption():
    """blip_image_caption is expected to produce the exact caption for the rocket sample image."""
    rocket = ski.data.rocket()
    caption = blip_image_caption(image=rocket)
    # Surrounding whitespace is ignored; the caption text itself must match exactly.
    assert caption.strip() == "a rocket on a stand"


def test_florence_image_caption() -> None:
def test_florence_image_caption():
    """florence2_image_caption is expected to mention the rocket on a launch pad at night."""
    rocket = ski.data.rocket()
    caption = florence2_image_caption(image=rocket)
    expected_fragment = "The image shows a rocket on a launch pad at night"
    # Substring match: the caption may contain more text than this fragment.
    assert expected_fragment in caption.strip()


def test_loca_zero_shot_counting() -> None:
def test_loca_zero_shot_counting():
img = ski.data.coins()

result = loca_zero_shot_counting(
Expand All @@ -268,7 +315,7 @@ def test_loca_zero_shot_counting() -> None:
assert result["count"] == 21


def test_loca_visual_prompt_counting() -> None:
def test_loca_visual_prompt_counting():
img = ski.data.coins()
result = loca_visual_prompt_counting(
visual_prompt={"bbox": [85, 106, 122, 145]},
Expand All @@ -277,7 +324,7 @@ def test_loca_visual_prompt_counting() -> None:
assert result["count"] == 25


def test_git_vqa_v2() -> None:
def test_git_vqa_v2():
img = ski.data.rocket()
result = git_vqa_v2(
prompt="Is the scene captured during day or night ?",
Expand All @@ -286,7 +333,7 @@ def test_git_vqa_v2() -> None:
assert result.strip() == "night"


def test_image_qa_with_context() -> None:
def test_image_qa_with_context():
img = ski.data.rocket()
result = florence2_roberta_vqa(
prompt="Is the scene captured during day or night ?",
Expand All @@ -295,7 +342,7 @@ def test_image_qa_with_context() -> None:
assert "night" in result.strip()


def test_ixc25_image_vqa() -> None:
def test_ixc25_image_vqa():
img = ski.data.cat()
result = ixc25_image_vqa(
prompt="What animal is in this image?",
Expand All @@ -304,7 +351,7 @@ def test_ixc25_image_vqa() -> None:
assert "cat" in result.strip()


def test_ixc25_video_vqa() -> None:
def test_ixc25_video_vqa():
frames = [
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
]
Expand All @@ -315,33 +362,36 @@ def test_ixc25_video_vqa() -> None:
assert "cat" in result.strip()


def test_ixc25_temporal_localization() -> None:
frames = [
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
]
result = ixc25_temporal_localization(
prompt="What animal is in this video?",
frames=frames,
)
assert result == [True] * 10


def test_ocr() -> None:
def test_ocr():
    """ocr is expected to find the 'Region-based segmentation' text in the page sample image."""
    page = ski.data.page()
    detections = ocr(image=page)
    expected_text = "Region-based segmentation"
    # At least one detected label must contain the expected heading text.
    assert any(expected_text in det["label"] for det in detections)


def test_florence2_ocr() -> None:
def test_ocr_empty():
    """ocr is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    detections = ocr(image=empty_image)
    assert detections == []


def test_florence2_ocr():
    """florence2_ocr is expected to find the 'Region-based segmentation' text in the page sample image."""
    page = ski.data.page()
    detections = florence2_ocr(image=page)
    expected_text = "Region-based segmentation"
    # At least one detected label must contain the expected heading text.
    assert any(expected_text in det["label"] for det in detections)


def test_florence2_ocr_empty():
    """florence2_ocr is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    detections = florence2_ocr(image=empty_image)
    assert detections == []


def test_mask_distance():
# Create two binary masks
mask1 = np.zeros((100, 100), dtype=np.uint8)
Expand Down Expand Up @@ -399,18 +449,34 @@ def test_generate_hed():
assert result.shape == img.shape


def test_countgd_counting() -> None:
def test_countgd_counting():
    """countgd_counting is expected to find 24 objects labelled 'coin' in the coins sample image."""
    coins = ski.data.coins()
    detections = countgd_counting(image=coins, prompt="coin")
    expected_count = 24
    assert len(detections) == expected_count
    labels = [det["label"] for det in detections]
    assert labels == ["coin"] * expected_count


def test_countgd_example_based_counting() -> None:
def test_countgd_counting_empty():
    """countgd_counting is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    detections = countgd_counting(prompt="coin", image=empty_image)
    assert detections == []


def test_countgd_example_based_counting():
    """countgd_example_based_counting is expected to find 24 'object' detections from one example box."""
    coins = ski.data.coins()
    example_boxes = [[85, 106, 122, 145]]
    detections = countgd_example_based_counting(
        visual_prompts=example_boxes,
        image=coins,
    )
    expected_count = 24
    assert len(detections) == expected_count
    labels = [det["label"] for det in detections]
    assert labels == ["object"] * expected_count


def test_countgd_example_based_counting_empty():
    """countgd_example_based_counting is expected to return [] for a zero-sized image."""
    empty_image = np.zeros((0, 0, 3), dtype=np.uint8)
    example_boxes = [[85, 106, 122, 145]]
    detections = countgd_example_based_counting(
        visual_prompts=example_boxes,
        image=empty_image,
    )
    assert detections == []
10 changes: 9 additions & 1 deletion tests/unit/test_va.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
from vision_agent.agent.vision_agent import parse_execution
from vision_agent.agent.agent_utils import extract_tag
from vision_agent.tools.meta_tools import use_extra_vision_agent_args


def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
    """Test helper: extract the ``execute_python`` tag contents from ``code``.

    Returns None when ``extract_tag`` yields a falsy result. Otherwise returns
    whatever ``use_extra_vision_agent_args`` produces for the extracted code,
    with ``test_multi_plan`` and ``custom_tool_names`` forwarded unchanged.
    """
    extracted = extract_tag(code, "execute_python")
    if extracted:
        return use_extra_vision_agent_args(
            extracted, test_multi_plan, custom_tool_names
        )
    return None


def test_parse_execution_zero():
Expand Down
Loading
Loading