Skip to content

Commit

Permalink
Fix issues 2 (#267)
Browse files Browse the repository at this point in the history
* switched to tags to reduce parsing issues

* tools can deal with empty images

* make fine tune id test looser

* automatically save files to artifacts

* update logo

* updated docs

* remove temporal localization
  • Loading branch information
dillonalaird authored Oct 14, 2024
1 parent 4664e72 commit 7bd8a11
Show file tree
Hide file tree
Showing 11 changed files with 376 additions and 216 deletions.
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<div align="center">
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.jpg?raw=true">
<img alt="vision_agent" height="200px" src="https://github.com/landing-ai/vision-agent/blob/main/assets/logo.png?raw=true">

# 🔍🤖 Vision Agent
[![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
Expand Down Expand Up @@ -345,6 +345,11 @@ result = agent.generate_code(conv)


## Additional Backends
### E2B Code Execution
If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
run all the agent generated code on the E2B backend.

### Anthropic
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
Anthropic API key and set it in your environment variables:
Expand Down
Binary file removed assets/logo.jpg
Binary file not shown.
Binary file added assets/logo.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
5 changes: 5 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,11 @@ result = agent.generate_code(conv)


## Additional Backends
### E2B Code Execution
If you wish to run your code on the E2B backend, make sure you have your `E2B_API_KEY`
set and then set `CODE_SANDBOX_RUNTIME=e2b` in your environment variables. This will
run all the agent generated code on the E2B backend.

### Anthropic
`AnthropicVisionAgentCoder` uses Anthropic. To get started you just need to get an
Anthropic API key and set it in your environment variables:
Expand Down
118 changes: 92 additions & 26 deletions tests/integ/test_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_temporal_localization,
ixc25_video_vqa,
loca_visual_prompt_counting,
loca_zero_shot_counting,
Expand Down Expand Up @@ -71,6 +70,14 @@ def test_owl_v2_image():
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])


def test_owl_v2_image_empty():
    """owl_v2_image must return no detections for a zero-sized image."""
    result = owl_v2_image(
        prompt="coin",
        # Build the empty frame as uint8 directly instead of allocating a
        # float64 array and copying it with astype.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []


def test_owl_v2_fine_tune_id():
img = ski.data.coins()
result = owl_v2_image(
Expand Down Expand Up @@ -110,6 +117,14 @@ def test_florence2_phrase_grounding():
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result])


def test_florence2_phrase_grounding_empty():
    """florence2_phrase_grounding must return no detections for a zero-sized image."""
    result = florence2_phrase_grounding(
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
        prompt="coin",
    )
    assert result == []


def test_florence2_phrase_grounding_fine_tune_id():
img = ski.data.coins()
result = florence2_phrase_grounding(
Expand Down Expand Up @@ -147,7 +162,7 @@ def test_florence2_phrase_grounding_video_fine_tune_id():
fine_tune_id=FINE_TUNE_ID,
)
assert len(result) == 10
assert 16 <= len([res["label"] for res in result[0]]) <= 26
assert 12 <= len([res["label"] for res in result[0]]) <= 26
assert all([all([0 <= x <= 1 for x in obj["bbox"]]) for obj in result[0]])


Expand Down Expand Up @@ -195,6 +210,14 @@ def test_florence2_sam2_image_fine_tune_id():
assert len([res["mask"] for res in result]) == len(result)


def test_florence2_sam2_image_empty():
    """florence2_sam2_image must return no masks for a zero-sized image."""
    result = florence2_sam2_image(
        prompt="coin",
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []


def test_florence2_sam2_video():
frames = [
np.array(Image.fromarray(ski.data.coins()).convert("RGB")) for _ in range(10)
Expand All @@ -208,7 +231,7 @@ def test_florence2_sam2_video():
assert len([res["mask"] for res in result[0]]) == 25


def test_segmentation():
def test_detr_segmentation():
img = ski.data.coins()
result = detr_segmentation(
image=img,
Expand All @@ -218,6 +241,13 @@ def test_segmentation():
assert len([res["mask"] for res in result]) == 1


def test_detr_segmentation_empty():
    """detr_segmentation must return no segments for a zero-sized image."""
    result = detr_segmentation(
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []


def test_clip():
img = ski.data.coins()
result = clip(
Expand All @@ -227,6 +257,15 @@ def test_clip():
assert result["scores"] == [0.9999, 0.0001]


def test_clip_empty():
    """clip must return empty score/label lists for a zero-sized image."""
    result = clip(
        classes=["coins", "notes"],
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result["scores"] == []
    assert result["labels"] == []


def test_vit_classification():
img = ski.data.coins()
result = vit_image_classification(
Expand All @@ -235,6 +274,14 @@ def test_vit_classification():
assert "typewriter keyboard" in result["labels"]


def test_vit_classification_empty():
    """vit_image_classification must return empty labels/scores for a zero-sized image."""
    result = vit_image_classification(
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result["labels"] == []
    assert result["scores"] == []

def test_nsfw_classification():
img = ski.data.coins()
result = vit_nsfw_classification(
Expand All @@ -243,23 +290,23 @@ def test_nsfw_classification():
assert result["label"] == "normal"


def test_image_caption() -> None:
def test_image_caption():
    """blip_image_caption should produce the known caption for the rocket photo."""
    rocket_img = ski.data.rocket()
    caption = blip_image_caption(image=rocket_img)
    assert caption.strip() == "a rocket on a stand"


def test_florence_image_caption() -> None:
def test_florence_image_caption():
    """florence2_image_caption should mention the night-time launch pad."""
    rocket_img = ski.data.rocket()
    caption = florence2_image_caption(image=rocket_img)
    assert "The image shows a rocket on a launch pad at night" in caption.strip()


def test_loca_zero_shot_counting() -> None:
def test_loca_zero_shot_counting():
img = ski.data.coins()

result = loca_zero_shot_counting(
Expand All @@ -268,7 +315,7 @@ def test_loca_zero_shot_counting() -> None:
assert result["count"] == 21


def test_loca_visual_prompt_counting() -> None:
def test_loca_visual_prompt_counting():
img = ski.data.coins()
result = loca_visual_prompt_counting(
visual_prompt={"bbox": [85, 106, 122, 145]},
Expand All @@ -277,7 +324,7 @@ def test_loca_visual_prompt_counting() -> None:
assert result["count"] == 25


def test_git_vqa_v2() -> None:
def test_git_vqa_v2():
img = ski.data.rocket()
result = git_vqa_v2(
prompt="Is the scene captured during day or night ?",
Expand All @@ -286,7 +333,7 @@ def test_git_vqa_v2() -> None:
assert result.strip() == "night"


def test_image_qa_with_context() -> None:
def test_image_qa_with_context():
img = ski.data.rocket()
result = florence2_roberta_vqa(
prompt="Is the scene captured during day or night ?",
Expand All @@ -295,7 +342,7 @@ def test_image_qa_with_context() -> None:
assert "night" in result.strip()


def test_ixc25_image_vqa() -> None:
def test_ixc25_image_vqa():
img = ski.data.cat()
result = ixc25_image_vqa(
prompt="What animal is in this image?",
Expand All @@ -304,7 +351,7 @@ def test_ixc25_image_vqa() -> None:
assert "cat" in result.strip()


def test_ixc25_video_vqa() -> None:
def test_ixc25_video_vqa():
frames = [
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
]
Expand All @@ -315,33 +362,36 @@ def test_ixc25_video_vqa() -> None:
assert "cat" in result.strip()


def test_ixc25_temporal_localization() -> None:
frames = [
np.array(Image.fromarray(ski.data.cat()).convert("RGB")) for _ in range(10)
]
result = ixc25_temporal_localization(
prompt="What animal is in this video?",
frames=frames,
)
assert result == [True] * 10


def test_ocr() -> None:
def test_ocr():
    """ocr should read the 'Region-based segmentation' heading off the page scan."""
    page_img = ski.data.page()
    detections = ocr(image=page_img)
    assert any("Region-based segmentation" in det["label"] for det in detections)


def test_florence2_ocr() -> None:
def test_ocr_empty():
    """ocr must return no text detections for a zero-sized image."""
    result = ocr(
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []


def test_florence2_ocr():
    """florence2_ocr should read the 'Region-based segmentation' heading off the page scan."""
    page_img = ski.data.page()
    detections = florence2_ocr(image=page_img)
    assert any("Region-based segmentation" in det["label"] for det in detections)


def test_florence2_ocr_empty():
    """florence2_ocr must return no text detections for a zero-sized image."""
    result = florence2_ocr(
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []


def test_mask_distance():
# Create two binary masks
mask1 = np.zeros((100, 100), dtype=np.uint8)
Expand Down Expand Up @@ -399,18 +449,34 @@ def test_generate_hed():
assert result.shape == img.shape


def test_countgd_counting() -> None:
def test_countgd_counting():
    """countgd_counting should find all 24 coins in the sample image."""
    coins_img = ski.data.coins()
    detections = countgd_counting(image=coins_img, prompt="coin")
    assert len(detections) == 24
    assert [det["label"] for det in detections] == ["coin"] * 24


def test_countgd_example_based_counting() -> None:
def test_countgd_counting_empty():
    """countgd_counting must return no detections for a zero-sized image."""
    result = countgd_counting(
        prompt="coin",
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []


def test_countgd_example_based_counting():
    """Example-based counting from one box prompt should find 24 objects."""
    coins_img = ski.data.coins()
    detections = countgd_example_based_counting(
        visual_prompts=[[85, 106, 122, 145]],
        image=coins_img,
    )
    assert len(detections) == 24
    assert [det["label"] for det in detections] == ["object"] * 24


def test_countgd_example_based_counting_empty():
    """Example-based counting must return no detections for a zero-sized image."""
    result = countgd_example_based_counting(
        visual_prompts=[[85, 106, 122, 145]],
        # uint8 from the start — avoids a float64 allocation plus astype copy.
        image=np.zeros((0, 0, 3), dtype=np.uint8),
    )
    assert result == []
10 changes: 9 additions & 1 deletion tests/unit/test_va.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
from vision_agent.agent.vision_agent import parse_execution
from vision_agent.agent.agent_utils import extract_tag
from vision_agent.tools.meta_tools import use_extra_vision_agent_args


def parse_execution(code, test_multi_plan=True, custom_tool_names=None):
    """Pull the <execute_python> payload out of ``code`` and expand it with
    the extra vision-agent arguments; return None when no payload is present.
    """
    extracted = extract_tag(code, "execute_python")
    if extracted:
        return use_extra_vision_agent_args(
            extracted, test_multi_plan, custom_tool_names
        )
    return None


def test_parse_execution_zero():
Expand Down
Loading

0 comments on commit 7bd8a11

Please sign in to comment.