diff --git a/README.md b/README.md
index 85fb45d6..954ed93e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,5 @@
- vision_agent
+ vision_agent
 # 🔍🤖 Vision Agent
@@ -8,6 +7,7 @@
 ![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
 [![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
 ![version](https://img.shields.io/pypi/pyversions/vision-agent)
+
 Vision Agent is a library that helps you use multimodal models to organize and structure your image data. Check out our Discord for roadmaps and updates!
diff --git a/tests/test_llm.py b/tests/test_llm.py
index 74453a4b..a8070f30 100644
--- a/tests/test_llm.py
+++ b/tests/test_llm.py
@@ -1,8 +1,7 @@
 import pytest
 
 from vision_agent.llm.llm import OpenAILLM
-from vision_agent.tools import CLIP
-from vision_agent.tools.tools import GroundingDINO
+from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM
 
 from .fixtures import openai_llm_mock  # noqa: F401
 
@@ -54,6 +53,6 @@ def test_generate_detector(openai_llm_mock):  # noqa: F811
 def test_generate_segmentor(openai_llm_mock):  # noqa: F811
     llm = OpenAILLM()
     prompt = "Can you generate a cat segmentor?"
-    segmentor = llm.generate_detector(prompt)
-    assert isinstance(segmentor, GroundingDINO)
+    segmentor = llm.generate_segmentor(prompt)
+    assert isinstance(segmentor, GroundingSAM)
     assert segmentor.prompt == "cat"
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index b539dfbb..de8960dd 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -55,7 +55,7 @@ class GroundingDINO(ImageTool):
         'Example 2: User Question: "Can you detect the person on the left?" {{"Parameters":{{"prompt": "person on the left"}}}}\n'
         'Example 3: User Question: "Can you build me a tool that detects red shirts and green shirts?" {{"Parameters":{{"prompt": "red shirt. green shirt"}}}}\n'
         "The tool returns a list of dictionaries, each containing the following keys:\n"
-        " - 'lable': The label of the detected object.\n"
+        " - 'label': The label of the detected object.\n"
         " - 'score': The confidence score of the detection.\n"
         " - 'bbox': The bounding box of the detected object. The box coordinates are normalized to [0, 1]\n"
         "An example output would be: [{'label': ['car'], 'score': [0.99], 'bbox': [[0.1, 0.2, 0.3, 0.4]]}]\n"
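
A minimal usage sketch of the behavior these changes exercise, based on the updated tests and the GroundingDINO prompt text above. The way a generated tool is invoked on an image (a call on an image path) and the file name "image.jpg" are assumptions for illustration and are not shown in this diff.

```python
# Sketch only: assumes generated tools are callable on an image path,
# which this diff does not show; "image.jpg" is a hypothetical file.
from vision_agent.llm.llm import OpenAILLM
from vision_agent.tools import GroundingDINO, GroundingSAM

llm = OpenAILLM()

# generate_detector is expected to return a GroundingDINO detection tool.
detector = llm.generate_detector("Can you build me a tool that detects cats?")
assert isinstance(detector, GroundingDINO)

# After this change, generate_segmentor returns a GroundingSAM segmentation tool.
segmentor = llm.generate_segmentor("Can you generate a cat segmentor?")
assert isinstance(segmentor, GroundingSAM)
assert segmentor.prompt == "cat"

# Per the prompt text, the detector returns a list of dictionaries with
# 'label', 'score', and 'bbox' keys, where bbox coordinates are normalized
# to [0, 1], e.g.
# [{'label': ['car'], 'score': [0.99], 'bbox': [[0.1, 0.2, 0.3, 0.4]]}]
detections = detector("image.jpg")  # assumed invocation
for det in detections:
    for label, score, bbox in zip(det["label"], det["score"], det["bbox"]):
        print(f"{label}: {score:.2f} at {bbox}")
```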