From 2471ddd76e16f400beff032c24bcdfca9b115e8f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 20 Mar 2024 09:34:26 -0700 Subject: [PATCH] Add EasyTool (#19) * add easytool and prompts * add more tools, reformat doc * add in imports * fix llm for tools * fix doc * fix formatting * fix flake8 * fix type errors * updated tests * updated tools * updated easytools * formatting fix --- tests/fixtures.py | 23 ++ tests/test_llm.py | 26 ++- tests/test_lmm.py | 26 ++- vision_agent/agent/__init__.py | 1 + vision_agent/agent/easytool.py | 271 ++++++++++++++++++++++++ vision_agent/agent/easytool_prompts.py | 82 ++++++++ vision_agent/llm/llm.py | 44 ++-- vision_agent/lmm/lmm.py | 30 +-- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/tools.py | 278 ++++++++++++++++++------- 10 files changed, 659 insertions(+), 124 deletions(-) create mode 100644 vision_agent/agent/easytool.py create mode 100644 vision_agent/agent/easytool_prompts.py diff --git a/tests/fixtures.py b/tests/fixtures.py index 5a64081f..602096ac 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -2,6 +2,8 @@ import pytest +from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM + @pytest.fixture def openai_llm_mock(request): @@ -27,3 +29,24 @@ def openai_lmm_mock(request): choices=[MagicMock(message=MagicMock(content=content))] ) yield mock_instance + + +@pytest.fixture +def clip_mock(request): + with patch.object(CLIP, "__call__", autospec=True) as mock: + mock.return_value = "test" + yield mock + + +@pytest.fixture +def grounding_dino_mock(request): + with patch.object(GroundingDINO, "__call__", autospec=True) as mock: + mock.return_value = "test" + yield mock + + +@pytest.fixture +def grounding_sam_mock(request): + with patch.object(GroundingSAM, "__call__", autospec=True) as mock: + mock.return_value = "test" + yield mock diff --git a/tests/test_llm.py b/tests/test_llm.py index a28288a4..0a671ca5 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -1,9 +1,13 @@ import pytest from vision_agent.llm.llm import OpenAILLM -from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM -from .fixtures import openai_llm_mock # noqa: F401 +from .fixtures import ( # noqa: F401 + clip_mock, + grounding_dino_mock, + grounding_sam_mock, + openai_llm_mock, +) @pytest.mark.parametrize( @@ -57,12 +61,12 @@ def test_call_with_mock(openai_llm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_llm_mock"], ) -def test_generate_classifier(openai_llm_mock): # noqa: F811 +def test_generate_classifier(openai_llm_mock, clip_mock): # noqa: F811 llm = OpenAILLM() prompt = "Can you generate a cat classifier?" classifier = llm.generate_classifier(prompt) - assert isinstance(classifier, CLIP) - assert classifier.prompt == "cat" + classifier("image.png") + assert clip_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -70,12 +74,12 @@ def test_generate_classifier(openai_llm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_llm_mock"], ) -def test_generate_detector(openai_llm_mock): # noqa: F811 +def test_generate_detector(openai_llm_mock, grounding_dino_mock): # noqa: F811 llm = OpenAILLM() prompt = "Can you generate a cat detector?" 
detector = llm.generate_detector(prompt) - assert isinstance(detector, GroundingDINO) - assert detector.prompt == "cat" + detector("image.png") + assert grounding_dino_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -83,9 +87,9 @@ def test_generate_detector(openai_llm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_llm_mock"], ) -def test_generate_segmentor(openai_llm_mock): # noqa: F811 +def test_generate_segmentor(openai_llm_mock, grounding_sam_mock): # noqa: F811 llm = OpenAILLM() prompt = "Can you generate a cat segmentor?" segmentor = llm.generate_segmentor(prompt) - assert isinstance(segmentor, GroundingSAM) - assert segmentor.prompt == "cat" + segmentor("image.png") + assert grounding_sam_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} diff --git a/tests/test_lmm.py b/tests/test_lmm.py index c1390726..16b5691f 100644 --- a/tests/test_lmm.py +++ b/tests/test_lmm.py @@ -4,9 +4,13 @@ from PIL import Image from vision_agent.lmm.lmm import OpenAILMM -from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM -from .fixtures import openai_lmm_mock # noqa: F401 +from .fixtures import ( # noqa: F401 + clip_mock, + grounding_dino_mock, + grounding_sam_mock, + openai_lmm_mock, +) def create_temp_image(image_format="jpeg"): @@ -77,12 +81,12 @@ def test_call_with_mock(openai_lmm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_lmm_mock"], ) -def test_generate_classifier(openai_lmm_mock): # noqa: F811 +def test_generate_classifier(openai_lmm_mock, clip_mock): # noqa: F811 lmm = OpenAILMM() prompt = "Can you generate a cat classifier?" classifier = lmm.generate_classifier(prompt) - assert isinstance(classifier, CLIP) - assert classifier.prompt == "cat" + classifier("image.png") + assert clip_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -90,12 +94,12 @@ def test_generate_classifier(openai_lmm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_lmm_mock"], ) -def test_generate_detector(openai_lmm_mock): # noqa: F811 +def test_generate_detector(openai_lmm_mock, grounding_dino_mock): # noqa: F811 lmm = OpenAILMM() prompt = "Can you generate a cat classifier?" detector = lmm.generate_detector(prompt) - assert isinstance(detector, GroundingDINO) - assert detector.prompt == "cat" + detector("image.png") + assert grounding_dino_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -103,9 +107,9 @@ def test_generate_detector(openai_lmm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_lmm_mock"], ) -def test_generate_segmentor(openai_lmm_mock): # noqa: F811 +def test_generate_segmentor(openai_lmm_mock, grounding_sam_mock): # noqa: F811 lmm = OpenAILMM() prompt = "Can you generate a cat classifier?" 
segmentor = lmm.generate_segmentor(prompt) - assert isinstance(segmentor, GroundingSAM) - assert segmentor.prompt == "cat" + segmentor("image.png") + assert grounding_sam_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 7dd4f393..aec05098 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,2 +1,3 @@ from .agent import Agent from .reflexion import Reflexion +from .easytool import EasyTool diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py new file mode 100644 index 00000000..b0fc5f2c --- /dev/null +++ b/vision_agent/agent/easytool.py @@ -0,0 +1,271 @@ +import json +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +from vision_agent import LLM, LMM, OpenAILLM +from vision_agent.tools import TOOLS + +from .agent import Agent +from .easytool_prompts import ( + ANSWER_GENERATE, + ANSWER_SUMMARIZE, + CHOOSE_PARAMETER, + CHOOSE_TOOL, + TASK_DECOMPOSE, + TASK_TOPOLOGY, +) + + +def parse_json(s: str) -> Any: + s = ( + s.replace(": true", ": True") + .replace(": false", ": False") + .replace(":true", ": True") + .replace(":false", ": False") + .replace("```", "") + .strip() + ) + return json.loads(s) + + +def change_name(name: str) -> str: + change_list = ["from", "class", "return", "false", "true", "id", "and", "", "ID"] + if name in change_list: + name = "is_" + name.lower() + return name + + +def format_tools(tools: Dict[int, Any]) -> str: + # Format this way so it's clear what the ID's are + tool_list = [] + for key in tools: + tool_list.append(f"ID: {key}, {tools[key]}\\n") + return str(tool_list) + + +def task_decompose( + model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any] +) -> Dict: + prompt = TASK_DECOMPOSE.format(question=question, tools=format_tools(tools)) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["Tasks"] # type: ignore + except Exception: + if tries > 10: + raise ValueError(f"Failed task_decompose on: {str_result}") + tries += 1 + continue + + +def task_topology( + model: Union[LLM, LMM, Agent], question: str, task_list: List[Dict] +) -> List[Dict[str, Any]]: + prompt = TASK_TOPOLOGY.format(question=question, task_list=task_list) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + for elt in result["Tasks"]: + if isinstance(elt["dep"], str): + elt["dep"] = [int(dep) for dep in elt["dep"].split(",")] + elif isinstance(elt["dep"], int): + elt["dep"] = [elt["dep"]] + elif isinstance(elt["dep"], list): + elt["dep"] = [int(dep) for dep in elt["dep"]] + return result["Tasks"] # type: ignore + except Exception: + if tries > 10: + raise ValueError(f"Failed task_topology on: {str_result}") + tries += 1 + continue + + +def choose_tool( + model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any] +) -> int: + prompt = CHOOSE_TOOL.format(question=question, tools=format_tools(tools)) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["ID"] # type: ignore + except Exception: + if tries > 10: + raise ValueError(f"Failed choose_tool on: {str_result}") + tries += 1 + continue + + +def choose_parameter( + model: Union[LLM, LMM, Agent], question: str, tool_usage: Dict, previous_log: str +) -> Any: + # TODO: should format tool_usage + prompt = 
CHOOSE_PARAMETER.format( + question=question, tool_usage=tool_usage, previous_log=previous_log + ) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["Parameters"] + except Exception: + if tries > 10: + raise ValueError(f"Failed choose_parameter on: {str_result}") + tries += 1 + continue + + +def answer_generate( + model: Union[LLM, LMM, Agent], question: str, call_results: str, previous_log: str +) -> str: + prompt = ANSWER_GENERATE.format( + question=question, call_results=call_results, previous_log=previous_log + ) + return model(prompt) + + +def answer_summarize( + model: Union[LLM, LMM, Agent], question: str, answers: List[Dict] +) -> str: + prompt = ANSWER_SUMMARIZE.format(question=question, answers=answers) + return model(prompt) + + +def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any: + return tool()(**parameters) + + +def retrieval( + model: Union[LLM, LMM, Agent], + question: str, + tools: Dict[int, Any], + previous_log: str, +) -> Tuple[List[Dict], str]: + # TODO: remove tools_used? + tool_id = choose_tool( + model, question, {k: v["description"] for k, v in tools.items()} + ) + if tool_id is None: # TODO + pass + + tool_instructions = tools[tool_id] + tool_usage = tool_instructions["usage"] + tool_name = tool_instructions["name"] + + parameters = choose_parameter(model, question, tool_usage, previous_log) + if parameters is None: # TODO + pass + tool_results = [{"tool_name": tool_name, "parameters": parameters}] + + if len(tool_results) == 0: # TODO + pass + + def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: + call_results: List[Any] = [] + if isinstance(result["parameters"], Dict): + call_result = function_call(tools[tool_id]["class"], result["parameters"]) + if call_result is None: + return call_results + call_results.append(call_result) + elif isinstance(result["parameters"], List): + for parameters in result["parameters"]: + call_result = function_call(tools[tool_id]["class"], parameters) + if call_result is None: + continue + call_results.append(call_result) + return call_results + + call_results = [] + if isinstance(tool_results, Set) or isinstance(tool_results, List): + for result in tool_results: + call_results.extend(parse_tool_results(result)) + elif isinstance(tool_results, Dict): + call_results.extend(parse_tool_results(tool_results)) + + call_results_str = "\n\n".join([str(e) for e in call_results]) + return tool_results, call_results_str + + +class EasyTool(Agent): + r"""This is an implementation of the EasyTool paper https://arxiv.org/abs/2401.06201 + based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool + from the FuncQA code. + + Examples:: + >>> from vision_agent.agent import EasyTool + >>> agent = EasyTool() + >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?") + >>> print(resp) + >>> "It will travel approximately 31.03 kilometers in 29 minutes." + >>> resp = agent("How many cards are in this image?", image="cards.jpg") + >>> print(resp) + >>> "There are 2 cards in this image." 
+ """ + + def __init__( + self, + task_model: Optional[Union[LLM, LMM]] = None, + answer_model: Optional[Union[LLM, LMM]] = None, + ): + self.task_model = ( + OpenAILLM(json_mode=True) if task_model is None else task_model + ) + self.answer_model = OpenAILLM() if answer_model is None else answer_model + + self.retrieval_num = 3 + self.tools = TOOLS + + def __call__( + self, + input: Union[List[Dict[str, str]], str], + image: Optional[Union[str, Path]] = None, + ) -> str: + if isinstance(input, str): + input = [{"role": "user", "content": input}] + return self.chat(input, image=image) + + def chat( + self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None + ) -> str: + question = chat[0]["content"] + if image: + question += f" Image name: {image}" + tasks = task_decompose( + self.task_model, + question, + {k: v["description"] for k, v in self.tools.items()}, + ) + task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)] + task_list = task_topology(self.task_model, question, task_list) + task_depend = {"Original Quesiton": question} + previous_log = "" + answers = [] + for task in task_list: + task_depend[task["id"]] = {"task": task["task"], "answer": ""} # type: ignore + # TODO topological sort task_list + for task in task_list: + task_str = task["task"] + previous_log = str(task_depend) + tool_results, call_results = retrieval( + self.task_model, + task_str, + self.tools, + previous_log, + ) + answer = answer_generate( + self.answer_model, task_str, call_results, previous_log + ) + answers.append({"task": task_str, "answer": answer}) + task_depend[task["id"]]["answer"] = answer # type: ignore + return answer_summarize(self.answer_model, question, answers) diff --git a/vision_agent/agent/easytool_prompts.py b/vision_agent/agent/easytool_prompts.py new file mode 100644 index 00000000..acc0a111 --- /dev/null +++ b/vision_agent/agent/easytool_prompts.py @@ -0,0 +1,82 @@ +TASK_DECOMPOSE = """You need to decompose a complex user's question into some simple subtasks and let the model execute it step by step. +This is the user's question: {question} +This is tool list: +{tools} + +Please note that: +1. You should only decompose this complex user's question into some simple subtasks which can be executed easily by using one single tool in the tool list. +2. If one subtask need the results from other subtask, you can should write clearly. For example: +{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}} +3. You must ONLY output in a parsible JSON format. An example output looks like: + +{{"Tasks": ["Task 1", "Task 2", ...]}} + +Output: """ + +TASK_TOPOLOGY = """Given a complex user's question, I have decompose this question into some simple subtasks. I think there exists a logical connections and order amontg the tasks. Thus you need to help me output this logical connections and order. +You must ONLY output in a parsible JSON format with the following format:" + +{{"Tasks": [{{"task": task, "id", task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}} + +The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1. + + +This is user's question: {question} + +These are subtasks of this question: + +{task_list} + +Output: """ + +CHOOSE_TOOL = """This is the user's question: {question} +These are the tools you can select to solve the question: +Tool List: +{tools} + +Please note that: +1. 
You should only choose one tool from the Tool List to solve this question. +2. You must ONLY output the ID of the tool you chose in a parsable JSON format. Two example outputs look like: + +Example 1: {{"ID": 1}} +Example 2: {{"ID": 2}} + +Output: """ + +CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question. +Please note that: +1. The Example in the API tool documentation can help you better understand the use of the API. +2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no required or optional parameters, just leave it as {{"Parameters":{{}}}} +3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs. +4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference. +5. If you need to use this API multiple times, please set "Parameters" to a list. +6. You must ONLY output in a parsable JSON format. Two example outputs look like: + +Example 1: {{"Parameters":{{"input": [1,2,3]}}}} +Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}} + +Here are the logs of previous questions and answers: +{previous_log} +This is the current user's question: {question} +This is the API tool documentation: {tool_usage} +Output: """ + + +ANSWER_GENERATE = """You should answer the question based on the response output by the API tool. +Please note that: +1. Try to organize the response into a natural language answer. +2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible. +3. If the API tool does not provide useful information in the response, please answer with your knowledge. +4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers. +Here are the logs of previous questions and answers: +{previous_log} +This is the user's question: {question} +This is the response output by the API tool: +{call_results} +We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible. +Output: """ + +ANSWER_SUMMARIZE = """We break down a complex user's problem into simple subtasks and provide answers to each simple subtask. 
You need to organize these answers to each subtask and form a self-consistent final answer to the user's question +This is the user's question: {question} +These are subtasks and their answers: {answers} +Final answer: """ diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index f07b0611..374b58c9 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -1,6 +1,6 @@ import json from abc import ABC, abstractmethod -from typing import Dict, List, Mapping, Union, cast +from typing import Callable, Dict, List, Mapping, Union, cast from openai import OpenAI @@ -10,7 +10,6 @@ SYSTEM_PROMPT, GroundingDINO, GroundingSAM, - ImageTool, ) @@ -31,24 +30,31 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: class OpenAILLM(LLM): r"""An LLM class for any OpenAI LLM model.""" - def __init__(self, model_name: str = "gpt-4-turbo-preview"): + def __init__( + self, model_name: str = "gpt-4-turbo-preview", json_mode: bool = False + ): self.model_name = model_name self.client = OpenAI() + self.json_mode = json_mode def generate(self, prompt: str) -> str: + kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {} response = self.client.chat.completions.create( model=self.model_name, messages=[ {"role": "user", "content": prompt}, ], + **kwargs, # type: ignore ) return cast(str, response.choices[0].message.content) def chat(self, chat: List[Dict[str, str]]) -> str: + kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {} response = self.client.chat.completions.create( model=self.model_name, messages=chat, # type: ignore + **kwargs, ) return cast(str, response.choices[0].message.content) @@ -58,8 +64,9 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: return self.generate(input) return self.chat(input) - def generate_classifier(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=CLIP.doc, question=prompt) + def generate_classifier(self, question: str) -> Callable: + api_doc = CLIP.description + "\n" + str(CLIP.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, @@ -72,36 +79,41 @@ def generate_classifier(self, prompt: str) -> ImageTool: params = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] - return CLIP(**cast(Mapping, params)) - def generate_detector(self, params: str) -> ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingDINO.doc, question=params) + return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x}) + + def generate_detector(self, question: str) -> Callable: + api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": params}, + {"role": "user", "content": prompt}, ], ) - params = json.loads(cast(str, response.choices[0].message.content))[ + params: Mapping = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] - return GroundingDINO(**cast(Mapping, params)) - def generate_segmentor(self, params: str) -> ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingSAM.doc, question=params) + return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x}) + + def 
generate_segmentor(self, question: str) -> Callable: + api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": params}, + {"role": "user", "content": prompt}, ], ) - params = json.loads(cast(str, response.choices[0].message.content))[ + params: Mapping = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] - return GroundingSAM(**cast(Mapping, params)) + + return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x}) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 2023000c..48d449f5 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -3,7 +3,7 @@ import logging from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Dict, List, Mapping, Optional, Union, cast +from typing import Any, Callable, Dict, List, Optional, Union, cast import requests from openai import OpenAI @@ -14,7 +14,6 @@ SYSTEM_PROMPT, GroundingDINO, GroundingSAM, - ImageTool, ) _LOGGER = logging.getLogger(__name__) @@ -168,8 +167,9 @@ def generate(self, prompt: str, image: Optional[Union[str, Path]] = None) -> str ) return cast(str, response.choices[0].message.content) - def generate_classifier(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=CLIP.doc, question=prompt) + def generate_classifier(self, question: str) -> Callable: + api_doc = CLIP.description + "\n" + str(CLIP.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, messages=[ @@ -179,7 +179,7 @@ def generate_classifier(self, prompt: str) -> ImageTool: ) try: - prompt = json.loads(cast(str, response.choices[0].message.content))[ + params = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] except json.JSONDecodeError: @@ -188,15 +188,16 @@ def generate_classifier(self, prompt: str) -> ImageTool: ) raise ValueError("Failed to decode response") - return CLIP(**cast(Mapping, prompt)) + return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x}) - def generate_detector(self, params: str) -> ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingDINO.doc, question=params) + def generate_detector(self, question: str) -> Callable: + api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": params}, + {"role": "user", "content": prompt}, ], ) @@ -210,10 +211,11 @@ def generate_detector(self, params: str) -> ImageTool: ) raise ValueError("Failed to decode response") - return GroundingDINO(**cast(Mapping, params)) + return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x}) - def generate_segmentor(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=GroundingSAM.doc, question=prompt) + def generate_segmentor(self, question: str) -> Callable: + api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, messages=[ @@ -223,7 +225,7 
@@ def generate_segmentor(self, prompt: str) -> ImageTool: ) try: - prompt = json.loads(cast(str, response.choices[0].message.content))[ + params = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] except json.JSONDecodeError: @@ -232,7 +234,7 @@ def generate_segmentor(self, prompt: str) -> ImageTool: ) raise ValueError("Failed to decode response") - return GroundingSAM(**cast(Mapping, prompt)) + return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x}) def get_lmm(name: str) -> LMM: diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 2f7fe9be..dae14d66 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,2 +1,2 @@ from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT -from .tools import CLIP, GroundingDINO, GroundingSAM, ImageTool +from .tools import CLIP, TOOLS, GroundingDINO, GroundingSAM, Tool diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index b892e2c9..c1b8fe2d 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,5 +1,5 @@ import logging -from abc import ABC, abstractmethod +from abc import ABC from pathlib import Path from typing import Any, Dict, List, Tuple, Union, cast @@ -17,18 +17,19 @@ def normalize_bbox( ) -> List[float]: r"""Normalize the bounding box coordinates to be between 0 and 1.""" x1, y1, x2, y2 = bbox - x1 = x1 / image_size[1] - y1 = y1 / image_size[0] - x2 = x2 / image_size[1] - y2 = y2 / image_size[0] + x1 = round(x1 / image_size[1], 2) + y1 = round(y1 / image_size[0], 2) + x2 = round(x2 / image_size[1], 2) + y2 = round(y2 / image_size[0], 2) return [x1, y1, x2, y2] def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray: - """ - mask_rle: run-length as string formated (start length) - shape: (height,width) of array to return - Returns numpy array, 1 - mask, 0 - background + r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background. + + Args: + mask_rle: Run-length as string formated (start length) + shape: The (height, width) of array to return """ s = mask_rle.split() starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])] @@ -40,39 +41,58 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray: return img.reshape(shape) -class ImageTool(ABC): - @abstractmethod - def __call__(self, image: Union[str, ImageType]) -> List[Dict]: - pass +class Tool(ABC): + name: str + description: str + usage: Dict -class CLIP(ImageTool): - """ - Example usage: - > from vision_agent.tools import tools - > t = tools.CLIP(["red line", "yellow dot", "none"]) - > t("examples/img/ct_scan1.jpg")) +class CLIP(Tool): + r"""CLIP is a tool that can classify or tag any image given a set if input classes + or tags. - [[0.02567436918616295, 0.9534115791320801, 0.020914122462272644]] + Examples:: + >>> from vision_agent.tools import tools + >>> t = tools.CLIP(["red line", "yellow dot", "none"]) + >>> t("examples/img/ct_scan1.jpg")) + >>> [[0.02567436918616295, 0.9534115791320801, 0.020914122462272644]] """ _ENDPOINT = "https://rb4ii6dfacmwqfxivi4aedyyfm0endsv.lambda-url.us-east-2.on.aws" - doc = ( - "CLIP is a tool that can classify or tag any image given a set if input classes or tags." + name = "clip_" + description = ( + "'clip_' is a tool that can classify or tag any image given a set if input classes or tags." 
"Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" - 'Example 1: User Question: "Can you classify this image as a cat?" {{"Parameters":{{"prompt": ["cat"]}}}}\n' - 'Example 2: User Question: "Can you tag this photograph with cat or dog?" {{"Parameters":{{"prompt": ["cat", "dog"]}}}}\n' - 'Exmaple 3: User Question: "Can you build me a classifier taht classifies red shirts, green shirts and other?" {{"Parameters":{{"prompt": ["red shirt", "green shirt", "other"]}}}}\n' ) + usage = { + "required_parameters": [ + {"name": "prompt", "type": "List[str]"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you classify this image as a cat? Image name: cat.jpg", + "parameters": {"prompt": ["cat"], "image": "cat.jpg"}, + }, + { + "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg", + "parameters": {"prompt": ["cat", "dog"], "image": "cat_dog.jpg"}, + }, + { + "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? Image name: shirts.jpg", + "parameters": { + "prompt": ["red shirt", "green shirt", "other"], + "image": "shirts.jpg", + }, + }, + ], + } - def __init__(self, prompt: list[str]): - self.prompt = prompt - - def __call__(self, image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: image_b64 = convert_to_b64(image) data = { - "classes": self.prompt, + "classes": prompt, "images": [image_b64], } res = requests.post( @@ -89,30 +109,48 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: return cast(List[Dict], resp_json["data"]) -class GroundingDINO(ImageTool): +class GroundingDINO(Tool): _ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws" - doc = ( - "Grounding DINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions." + name = "grounding_dino_" + description = ( + "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" - 'Example 1: User Question: "Can you build me a car detector?" {{"Parameters":{{"prompt": "car"}}}}\n' - 'Example 2: User Question: "Can you detect the person on the left?" {{"Parameters":{{"prompt": "person on the left"}}\n' - 'Exmaple 3: User Question: "Can you build me a tool that detects red shirts and green shirts?" {{"Parameters":{{"prompt": "red shirt. green shirt"}}}}\n' "The tool returns a list of dictionaries, each containing the following keys:\n" - " - 'label': The label of the detected object.\n" - " - 'score': The confidence score of the detection.\n" - " - 'bbox': The bounding box of the detected object. The box coordinates are normalize to [0, 1]\n" - "An example output would be: [{'label': ['car'], 'score': [0.99], 'bbox': [[0.1, 0.2, 0.3, 0.4]]}]\n" + ' - "label": The label of the detected object.\n' + ' - "score": The confidence score of the detection.\n' + ' - "bbox": The bounding box of the detected object. 
The box coordinates are normalize to [0, 1]\n' + 'An example output would be: [{"label": ["car"], "score": [0.99], "bbox": [[0.1, 0.2, 0.3, 0.4]]}]\n' ) + usage = { + "required_parameters": [ + {"name": "prompt", "type": "str"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you build me a car detector?", + "parameters": {"prompt": "car", "image": ""}, + }, + { + "scenario": "Can you detect the person on the left? Image name: person.jpg", + "parameters": {"prompt": "person on the left", "image": "person.jpg"}, + }, + { + "scenario": "Detect the red shirts and green shirst. Image name: shirts.jpg", + "parameters": { + "prompt": "red shirt. green shirt", + "image": "shirts.jpg", + }, + }, + ], + } - def __init__(self, prompt: str): - self.prompt = prompt - - def __call__(self, image: Union[str, Path, ImageType]) -> List[Dict]: + def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]: image_size = get_image_size(image) image_b64 = convert_to_b64(image) data = { - "prompt": self.prompt, + "prompt": prompt, "images": [image_b64], } res = requests.post( @@ -132,44 +170,65 @@ def __call__(self, image: Union[str, Path, ImageType]) -> List[Dict]: elt["bboxes"] = [ normalize_bbox(box, image_size) for box in elt["bboxes"] ] + if "scores" in elt: + elt["scores"] = [round(score, 2) for score in elt["scores"]] return cast(List[Dict], resp_data) -class GroundingSAM(ImageTool): - """ - Example usage: - > from vision_agent.tools import tools - > t = tools.GroundingSAM(["red line", "yellow dot", "none"]) - > t("examples/img/ct_scan1.jpg") - - [{'label': 'none', 'mask': array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, {'label': 'red line', 'mask': array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)}] +class GroundingSAM(Tool): + r"""Grounding SAM is a tool that can detect and segment arbitrary objects with + inputs such as category names or referring expressions. + + Examples:: + >>> from vision_agent.tools import tools + >>> t = tools.GroundingSAM(["red line", "yellow dot", "none"]) + >>> t("examples/img/ct_scan1.jpg") + >>> [{'label': 'none', 'mask': array([[0, 0, 0, ..., 0, 0, 0], + >>> [0, 0, 0, ..., 0, 0, 0], + >>> ..., + >>> [0, 0, 0, ..., 0, 0, 0], + >>> [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, {'label': 'red line', 'mask': array([[0, 0, 0, ..., 0, 0, 0], + >>> [0, 0, 0, ..., 0, 0, 0], + >>> ..., + >>> [1, 1, 1, ..., 1, 1, 1], + >>> [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)}] """ _ENDPOINT = "https://cou5lfmus33jbddl6hoqdfbw7e0qidrw.lambda-url.us-east-2.on.aws" - doc = ( - "Grounding SAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions." + name = "grounding_sam_" + description = ( + "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" - 'Example 1: User Question: "Can you build me a car segmentor?" {{"Parameters":{{"prompt": ["car"]}}}}\n' - 'Example 2: User Question: "Can you segment the person on the left?" 
{{"Parameters":{{"prompt": ["person on the left"]}}\n' - 'Exmaple 3: User Question: "Can you build me a tool that segments red shirts and green shirts?" {{"Parameters":{{"prompt": ["red shirt", "green shirt"]}}}}\n' ) + usage = { + "required_parameters": [ + {"name": "prompt", "type": "List[str]"}, + {"name": "image", "type": "str"}, + ], + "examples": [ + { + "scenario": "Can you build me a car segmentor?", + "parameters": {"prompt": ["car"], "image": ""}, + }, + { + "scenario": "Can you segment the person on the left? Image name: person.jpg", + "parameters": {"prompt": ["person on the left"], "image": "person.jpg"}, + }, + { + "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg", + "parameters": { + "prompt": ["red shirt", "green shirt"], + "image": "shirts.jpg", + }, + }, + ], + } - def __init__(self, prompt: list[str]): - self.prompt = prompt - - def __call__(self, image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: image_b64 = convert_to_b64(image) data = { - "classes": self.prompt, + "classes": prompt, "image": image_b64, } res = requests.post( @@ -195,3 +254,80 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: } ) return preds + + +class Add(Tool): + name = "add_" + description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places." + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 2 + 4", + "parameters": {"input": [2, 4]}, + } + ], + } + + def __call__(self, input: List[int]) -> float: + return round(sum(input), 2) + + +class Subtract(Tool): + name = "subtract_" + description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places." + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 4 - 2", + "parameters": {"input": [4, 2]}, + } + ], + } + + def __call__(self, input: List[int]) -> float: + return round(input[0] - input[1], 2) + + +class Multiply(Tool): + name = "multiply_" + description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places." + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 2 * 4", + "parameters": {"input": [2, 4]}, + } + ], + } + + def __call__(self, input: List[int]) -> float: + return round(input[0] * input[1], 2) + + +class Divide(Tool): + name = "divide_" + description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places." + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 4 / 2", + "parameters": {"input": [4, 2]}, + } + ], + } + + def __call__(self, input: List[int]) -> float: + return round(input[0] / input[1], 2) + + +TOOLS = { + i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c} + for i, c in enumerate( + [CLIP, GroundingDINO, GroundingSAM, Add, Subtract, Multiply, Divide] + ) + if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage")) +}