From f81e1693801bc0161500f8f8ca04777cf030d2ae Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:13:51 -0700 Subject: [PATCH 01/12] add easytool and prompts --- vision_agent/agent/easytool.py | 273 +++++++++++++++++++++++++ vision_agent/agent/easytool_prompts.py | 82 ++++++++ 2 files changed, 355 insertions(+) create mode 100644 vision_agent/agent/easytool.py create mode 100644 vision_agent/agent/easytool_prompts.py diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py new file mode 100644 index 00000000..4690a829 --- /dev/null +++ b/vision_agent/agent/easytool.py @@ -0,0 +1,273 @@ +import json +from pathlib import Path +from typing import Any, Dict, List, Optional, Set, Tuple, Union + +from vision_agent import LLM, LMM, OpenAILLM +from vision_agent.tools import TOOLS + +from .agent import Agent +from .easytool_prompts import ( + ANSWER_GENERATE, + ANSWER_SUMMARIZE, + CHOOSE_PARAMETER, + CHOOSE_TOOL, + TASK_DECOMPOSE, + TASK_TOPOLOGY, +) + + +def parse_json(s: str) -> Dict: + s = ( + s.replace(": true", ": True") + .replace(": false", ": False") + .replace(":true", ": True") + .replace(":false", ": False") + .replace("```", "") + .strip() + ) + return json.loads(s) + + +def change_name(name: str): + change_list = ["from", "class", "return", "false", "true", "id", "and", "", "ID"] + if name in change_list: + name = "is_" + name.lower() + return name + + +def format_tools(tools: Dict[int, Any]) -> str: + # Format this way so it's clear what the ID's are + tool_list = [] + for key in tools: + tool_list.append(f"ID: {key}, {tools[key]}\\n") + return str(tool_list) + + +def task_decompose( + model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any] +) -> Dict: + prompt = TASK_DECOMPOSE.format(question=question, tools=format_tools(tools)) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["Tasks"] + except Exception as _: + if tries > 10: + raise ValueError(f"Failed task_decompose on: {str_result}") + tries += 1 + continue + + +def task_topology( + model: Union[LLM, LMM, Agent], question: str, task_list: List[Dict] +) -> List[Dict[str, Any]]: + prompt = TASK_TOPOLOGY.format(question=question, task_list=task_list) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + for elt in result["Tasks"]: + if isinstance(elt["dep"], str): + elt["dep"] = [int(dep) for dep in elt["dep"].split(",")] + elif isinstance(elt["dep"], int): + elt["dep"] = [elt["dep"]] + elif isinstance(elt["dep"], list): + elt["dep"] = [int(dep) for dep in elt["dep"]] + return result["Tasks"] ## TODO + except Exception as _: + if tries > 10: + raise ValueError(f"Failed task_topology on: {str_result}") + tries += 1 + continue + + +def choose_tool( + model: Union[LLM, LMM, Agent], question: str, tools: Dict[int, Any] +) -> int: + prompt = CHOOSE_TOOL.format(question=question, tools=format_tools(tools)) + tries = 0 + str_result = "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["ID"] + except Exception as _: + if tries > 10: + raise ValueError(f"Failed choose_tool on: {str_result}") + tries += 1 + continue + + +def choose_parameter( + model: Union[LLM, LMM, Agent], question: str, tool_usage: Dict, previous_log: str +) -> Any: + # TODO: should format tool_usage + prompt = CHOOSE_PARAMETER.format( + question=question, tool_usage=tool_usage, previous_log=previous_log + ) + tries = 0 + str_result 
= "" + while True: + try: + str_result = model(prompt) + result = parse_json(str_result) + return result["Parameters"] + except Exception as _: + if tries > 10: + raise ValueError(f"Failed choose_parameter on: {str_result}") + tries += 1 + continue + + +def answer_generate( + model: Union[LLM, LMM, Agent], question: str, call_results: str, previous_log: str +) -> str: + prompt = ANSWER_GENERATE.format( + question=question, call_results=call_results, previous_log=previous_log + ) + return model(prompt) + + +def answer_summarize( + model: Union[LLM, LMM, Agent], question: str, answers: List[Dict] +) -> str: + prompt = ANSWER_SUMMARIZE.format(question=question, answers=answers) + return model(prompt) + + +def retrieval( + model: Union[LLM, LMM, Agent], + question: str, + tools: Dict[int, Any], + previous_log: str, +) -> Tuple[List[Dict], str]: + # TODO: remove tools_used? + tool_id = choose_tool( + model, question, {k: v["description"] for k, v in tools.items()} + ) + if tool_id is None: # TODO + pass + + tool_instructions = tools[tool_id] + tool_description = tool_instructions["description"] + tool_usage = tool_instructions["usage"] + tool_name = tool_instructions["name"] + + parameters = choose_parameter(model, question, tool_usage, previous_log) + if parameters is None: # TODO + pass + tool_results = [{"tool_name": tool_name, "parameters": parameters}] + + if len(tool_results) == 0: # TODO + pass + + def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: + call_results = [] + if isinstance(result["parameters"], Dict): + parameters = {} + for key in result["parameters"]: + parameters[change_name(key)] = result["parameters"][key] + # TODO: wrap call to handle errors + call_result = tools[tool_id]["class"]()(**parameters) + if call_result == None: + continue + call_results.append(call_result) + elif isinstance(result["parameters"], List): + for param_list in result["parameters"]: + parameters = {} + for key in param_list: + parameters[change_name(key)] = param_list[key] + call_result = tools[tool_id]["class"]()(**parameters) + if call_result == None: + continue + call_results.append(call_result) + return call_results + + call_results = [] + if isinstance(tool_results, Set) or isinstance(tool_results, List): + for result in tool_results: + call_results.extend(parse_tool_results(result)) + elif isinstance(tool_results, Dict): + call_results.extend(parse_tool_results(tool_results)) + + call_results_str = "\n\n".join([str(e) for e in call_results]) + return tool_results, call_results_str + + +class EasyTool(Agent): + r"""This is an implementation of the EasyTool paper https://arxiv.org/abs/2401.06201 + based on the original implementation https://github.com/microsoft/JARVIS/tree/main/easytool + from the funcQA code. + + Examples:: + >>> from vision_agent.agent import EasyTool + >>> agent = EasyTool() + >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?") + >>> print(resp) + >>> "It will travel approximately 31.03 kilometers in 29 minutes." 
+    """
+    def __init__(
+        self,
+        task_model: Optional[Union[LLM, LMM]] = None,
+        answer_model: Optional[Union[LLM, LMM]] = None,
+    ):
+        if task_model is None:
+            self.task_model = OpenAILLM(json_mode=True)
+        else:
+            self.task_model = task_model
+
+        if answer_model is None:
+            self.answer_model = OpenAILLM()
+        else:
+            self.answer_model = answer_model
+
+        self.retrieval_num = 3
+        self.tools = TOOLS
+
+    def __call__(
+        self,
+        input: Union[List[Dict[str, str]], str],
+        image: Optional[Union[str, Path]] = None,
+    ) -> str:
+        if isinstance(input, str):
+            input = [{"role": "user", "content": input}]
+        return self.chat(input, image=image)
+
+    def chat(
+        self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None
+    ) -> str:
+        question = chat[0]["content"]
+        tasks = task_decompose(
+            self.task_model,
+            question,
+            {k: v["description"] for k, v in self.tools.items()},
+        )
+        task_list = [{"task": task, "id": i + 1} for i, task in enumerate(tasks)]
+        task_list = task_topology(self.task_model, question, task_list)
+        task_depend = {"Original Question": question}
+        previous_log = ""
+        answers = []
+        for task in task_list:
+            task_depend[task["id"]] = {"task": task["task"], "answer": ""}  # type: ignore
+        # TODO topological sort task_list
+        for task in task_list:
+            task_str = task["task"]
+            previous_log = str(task_depend)
+            tool_results, call_results = retrieval(
+                self.task_model,
+                task_str,
+                self.tools,
+                previous_log,
+            )
+            answer = answer_generate(
+                self.answer_model, task_str, call_results, previous_log
+            )
+            answers.append({"task": task_str, "answer": answer})
+            task_depend[task["id"]]["answer"] = answer  # type: ignore
+        return answer_summarize(self.answer_model, question, answers)
diff --git a/vision_agent/agent/easytool_prompts.py b/vision_agent/agent/easytool_prompts.py
new file mode 100644
index 00000000..acc0a111
--- /dev/null
+++ b/vision_agent/agent/easytool_prompts.py
@@ -0,0 +1,82 @@
+TASK_DECOMPOSE = """You need to decompose a complex user's question into simple subtasks and let the model execute them step by step.
+This is the user's question: {question}
+This is the tool list:
+{tools}
+
+Please note that:
+1. You should only decompose this complex user's question into simple subtasks, each of which can be executed easily by using one single tool in the tool list.
+2. If one subtask needs the results from another subtask, you should write that clearly. For example:
+{{"Tasks": ["Convert 23 km/h to X km/min by 'divide_'", "Multiply X km/min by 45 min to get Y by 'multiply_'"]}}
+3. You must ONLY output in a parsable JSON format. An example output looks like:
+
+{{"Tasks": ["Task 1", "Task 2", ...]}}
+
+Output: """
+
+TASK_TOPOLOGY = """Given a complex user's question, I have decomposed this question into some simple subtasks. There are logical connections and an order among the tasks, so you need to help me output these logical connections and the order.
+You must ONLY output in a parsable JSON format with the following format:
+
+{{"Tasks": [{{"task": task, "id": task_id, "dep": [dependency_task_id1, dependency_task_id2, ...]}}]}}
+
+The "dep" field denotes the id of the previous task which generates a new resource upon which the current task depends. If there are no dependencies, set "dep" to -1.
+
+
+This is the user's question: {question}
+
+These are the subtasks of this question:
+
+{task_list}
+
+Output: """
+
+CHOOSE_TOOL = """This is the user's question: {question}
+These are the tools you can select to solve the question:
+Tool List:
+{tools}
+
+Please note that:
+1. You should only choose one tool from the Tool List to solve this question.
+2. You must ONLY output the ID of the tool you chose in a parsable JSON format. Two example outputs look like:
+
+Example 1: {{"ID": 1}}
+Example 2: {{"ID": 2}}
+
+Output: """
+
+CHOOSE_PARAMETER = """Given a user's question and an API tool documentation, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.
+Please note that:
+1. The Example in the API tool documentation can help you better understand the use of the API.
+2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If there are no parameters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}
+3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.
+4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers for your reference.
+5. If you need to use this API multiple times, please set "Parameters" to a list.
+6. You must ONLY output in a parsable JSON format. Two example outputs look like:
+
+Example 1: {{"Parameters":{{"input": [1,2,3]}}}}
+Example 2: {{"Parameters":[{{"input": [1,2,3]}}, {{"input": [2,3,4]}}]}}
+
+These are the logs of previous questions and answers:
+{previous_log}
+This is the current user's question: {question}
+This is the API tool documentation: {tool_usage}
+Output: """
+
+
+ANSWER_GENERATE = """You should answer the question based on the response output by the API tool.
+Please note that:
+1. Try to organize the response into a natural language answer.
+2. We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
+3. If the API tool does not provide useful information in the response, please answer with your own knowledge.
+4. The question may have dependencies on answers of other questions, so we will provide logs of previous questions and answers.
+These are the logs of previous questions and answers:
+{previous_log}
+This is the user's question: {question}
+This is the response output by the API tool:
+{call_results}
+We will not show the API response to the user, thus you need to make full use of the response and give the information in the response that can satisfy the user's question in as much detail as possible.
+Output: """
+
+ANSWER_SUMMARIZE = """We break down a user's complex problem into simple subtasks and provide answers to each simple subtask.
You need to organize these answers to each subtask and form a self-consistent final answer to the user's question +This is the user's question: {question} +These are subtasks and their answers: {answers} +Final answer: """ From 5cc52f9711b4450cba9f361ab809c7cbde3ae47b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:14:13 -0700 Subject: [PATCH 02/12] add more tools, reformat doc --- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/tools.py | 182 ++++++++++++++++++++++++++------- 2 files changed, 145 insertions(+), 39 deletions(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 2f7fe9be..a608c059 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,2 +1,2 @@ from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT -from .tools import CLIP, GroundingDINO, GroundingSAM, ImageTool +from .tools import CLIP, TOOLS, GroundingDINO, GroundingSAM, ImageTool diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index b892e2c9..1c60738d 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -25,10 +25,11 @@ def normalize_bbox( def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray: - """ - mask_rle: run-length as string formated (start length) - shape: (height,width) of array to return - Returns numpy array, 1 - mask, 0 - background + r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background. + + Args: + mask_rle: Run-length as string formated (start length) + shape: The (height, width) of array to return """ s = mask_rle.split() starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])] @@ -47,24 +48,27 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: class CLIP(ImageTool): - """ - Example usage: - > from vision_agent.tools import tools - > t = tools.CLIP(["red line", "yellow dot", "none"]) - > t("examples/img/ct_scan1.jpg")) + r"""CLIP is a tool that can classify or tag any image given a set if input classes + or tags. - [[0.02567436918616295, 0.9534115791320801, 0.020914122462272644]] + Examples:: + >>> from vision_agent.tools import tools + >>> t = tools.CLIP(["red line", "yellow dot", "none"]) + >>> t("examples/img/ct_scan1.jpg")) + >>> [[0.02567436918616295, 0.9534115791320801, 0.020914122462272644]] """ _ENDPOINT = "https://rb4ii6dfacmwqfxivi4aedyyfm0endsv.lambda-url.us-east-2.on.aws" - doc = ( - "CLIP is a tool that can classify or tag any image given a set if input classes or tags." + name = "clip_" + description = ( + "'clip_' is a tool that can classify or tag any image given a set if input classes or tags." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" 'Example 1: User Question: "Can you classify this image as a cat?" {{"Parameters":{{"prompt": ["cat"]}}}}\n' 'Example 2: User Question: "Can you tag this photograph with cat or dog?" {{"Parameters":{{"prompt": ["cat", "dog"]}}}}\n' 'Exmaple 3: User Question: "Can you build me a classifier taht classifies red shirts, green shirts and other?" 
{{"Parameters":{{"prompt": ["red shirt", "green shirt", "other"]}}}}\n' ) + usage = {} def __init__(self, prompt: list[str]): self.prompt = prompt @@ -92,18 +96,33 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: class GroundingDINO(ImageTool): _ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws" - doc = ( - "Grounding DINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions." + name = "grounding_dino_" + description = ( + "'grounding_dino_' is a tool that can detect arbitrary objects with inputs such as category names or referring expressions." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" - 'Example 1: User Question: "Can you build me a car detector?" {{"Parameters":{{"prompt": "car"}}}}\n' - 'Example 2: User Question: "Can you detect the person on the left?" {{"Parameters":{{"prompt": "person on the left"}}\n' - 'Exmaple 3: User Question: "Can you build me a tool that detects red shirts and green shirts?" {{"Parameters":{{"prompt": "red shirt. green shirt"}}}}\n' "The tool returns a list of dictionaries, each containing the following keys:\n" - " - 'label': The label of the detected object.\n" - " - 'score': The confidence score of the detection.\n" - " - 'bbox': The bounding box of the detected object. The box coordinates are normalize to [0, 1]\n" - "An example output would be: [{'label': ['car'], 'score': [0.99], 'bbox': [[0.1, 0.2, 0.3, 0.4]]}]\n" + ' - "label": The label of the detected object.\n' + ' - "score": The confidence score of the detection.\n' + ' - "bbox": The bounding box of the detected object. The box coordinates are normalize to [0, 1]\n' + 'An example output would be: [{"label": ["car"], "score": [0.99], "bbox": [[0.1, 0.2, 0.3, 0.4]]}]\n' ) + usage = { + "required_parameters": {"name": "prompt", "type": "str"}, + "examples": [ + { + "scenario": "Can you build me a car detector?", + "parameters": {"prompt": "car"}, + }, + { + "scenario": "Can you detect the person on the left?", + "parameters": {"prompt": "person on the left"}, + }, + { + "scenario": "Detect the red shirts and green shirst.", + "parameters": {"prompt": "red shirt. green shirt"}, + }, + ], + } def __init__(self, prompt: str): self.prompt = prompt @@ -136,32 +155,35 @@ def __call__(self, image: Union[str, Path, ImageType]) -> List[Dict]: class GroundingSAM(ImageTool): - """ - Example usage: - > from vision_agent.tools import tools - > t = tools.GroundingSAM(["red line", "yellow dot", "none"]) - > t("examples/img/ct_scan1.jpg") - - [{'label': 'none', 'mask': array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, {'label': 'red line', 'mask': array([[0, 0, 0, ..., 0, 0, 0], - [0, 0, 0, ..., 0, 0, 0], - ..., - [1, 1, 1, ..., 1, 1, 1], - [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)}] + r"""Grounding SAM is a tool that can detect and segment arbitrary objects with + inputs such as category names or referring expressions. 
+ + Examples:: + >>> from vision_agent.tools import tools + >>> t = tools.GroundingSAM(["red line", "yellow dot", "none"]) + >>> t("examples/img/ct_scan1.jpg") + >>> [{'label': 'none', 'mask': array([[0, 0, 0, ..., 0, 0, 0], + >>> [0, 0, 0, ..., 0, 0, 0], + >>> ..., + >>> [0, 0, 0, ..., 0, 0, 0], + >>> [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}, {'label': 'red line', 'mask': array([[0, 0, 0, ..., 0, 0, 0], + >>> [0, 0, 0, ..., 0, 0, 0], + >>> ..., + >>> [1, 1, 1, ..., 1, 1, 1], + >>> [1, 1, 1, ..., 1, 1, 1]], dtype=uint8)}] """ _ENDPOINT = "https://cou5lfmus33jbddl6hoqdfbw7e0qidrw.lambda-url.us-east-2.on.aws" - doc = ( - "Grounding SAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions." + name = "grounding_sam_" + description = ( + "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" 'Example 1: User Question: "Can you build me a car segmentor?" {{"Parameters":{{"prompt": ["car"]}}}}\n' 'Example 2: User Question: "Can you segment the person on the left?" {{"Parameters":{{"prompt": ["person on the left"]}}\n' 'Exmaple 3: User Question: "Can you build me a tool that segments red shirts and green shirts?" {{"Parameters":{{"prompt": ["red shirt", "green shirt"]}}}}\n' ) + usage = {} def __init__(self, prompt: list[str]): self.prompt = prompt @@ -195,3 +217,87 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: } ) return preds + + +class Add: + name = "add_" + description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places." + usage = ( + { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 2 + 4", + "parameters": {"input": [2, 4]}, + } + ], + }, + ) + + def __call__(self, input: List[int]) -> float: + return round(sum(input), 2) + + +class Subtract: + name = "subtract_" + description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places." + usage = ( + { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 4 - 2", + "parameters": {"input": [4, 2]}, + } + ], + }, + ) + + def __call__(self, input: List[int]) -> float: + return round(input[0] - input[1], 2) + + +class Multiply: + name = "multiply_" + description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places." + usage = ( + { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 2 * 4", + "parameters": {"input": [2, 4]}, + } + ], + }, + ) + + def __call__(self, input: List[int]) -> float: + return round(input[0] * input[1], 2) + + +class Divide: + name = "divide_" + description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places." 
+ usage = ( + { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 4 / 2", + "parameters": {"input": [4, 2]}, + } + ], + }, + ) + + def __call__(self, input: List[int]) -> float: + return round(input[0] / input[1], 2) + + +TOOLS = { + i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c} + for i, c in enumerate( + [CLIP, GroundingDINO, GroundingSAM, Add, Subtract, Multiply, Divide] + ) +} From 82d192a695b19d209b4c19233451a6f640178b84 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:14:24 -0700 Subject: [PATCH 03/12] add in imports --- vision_agent/agent/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 7dd4f393..aec05098 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,2 +1,3 @@ from .agent import Agent from .reflexion import Reflexion +from .easytool import EasyTool From 22d5b74065be9ca5f0eb3ae21e2bdee146d7a9d7 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:14:51 -0700 Subject: [PATCH 04/12] fix llm for tools --- vision_agent/llm/llm.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index f07b0611..412369ca 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -31,24 +31,29 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: class OpenAILLM(LLM): r"""An LLM class for any OpenAI LLM model.""" - def __init__(self, model_name: str = "gpt-4-turbo-preview"): + def __init__(self, model_name: str = "gpt-4-turbo-preview", json_mode: bool = False): self.model_name = model_name self.client = OpenAI() + self.json_mode = json_mode def generate(self, prompt: str) -> str: + kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {} response = self.client.chat.completions.create( model=self.model_name, messages=[ {"role": "user", "content": prompt}, ], + **kwargs, # type: ignore ) return cast(str, response.choices[0].message.content) def chat(self, chat: List[Dict[str, str]]) -> str: + kwargs = {"response_format": {"type": "json_object"}} if self.json_mode else {} response = self.client.chat.completions.create( model=self.model_name, messages=chat, # type: ignore + **kwargs, # type: ignore ) return cast(str, response.choices[0].message.content) @@ -59,7 +64,7 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: return self.chat(input) def generate_classifier(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=CLIP.doc, question=prompt) + prompt = CHOOSE_PARAMS.format(api_doc=CLIP.description, question=prompt) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, @@ -75,7 +80,8 @@ def generate_classifier(self, prompt: str) -> ImageTool: return CLIP(**cast(Mapping, params)) def generate_detector(self, params: str) -> ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingDINO.doc, question=params) + api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) + params = CHOOSE_PARAMS.format(api_doc=api_doc, question=params) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, @@ -91,7 +97,7 @@ def generate_detector(self, params: str) -> ImageTool: return GroundingDINO(**cast(Mapping, params)) def generate_segmentor(self, params: str) -> 
ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingSAM.doc, question=params) + params = CHOOSE_PARAMS.format(api_doc=GroundingSAM.description, question=params) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, From 4f01bae893b96699edb55f2d6c21f4874cce32a6 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:16:48 -0700 Subject: [PATCH 05/12] fix doc --- vision_agent/llm/llm.py | 6 ++++-- vision_agent/lmm/lmm.py | 9 ++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index 412369ca..91e3ebe8 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -64,7 +64,8 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: return self.chat(input) def generate_classifier(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=CLIP.description, question=prompt) + api_doc = CLIP.description + "\n" + str(CLIP.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=prompt) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, @@ -97,7 +98,8 @@ def generate_detector(self, params: str) -> ImageTool: return GroundingDINO(**cast(Mapping, params)) def generate_segmentor(self, params: str) -> ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingSAM.description, question=params) + api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) + params = CHOOSE_PARAMS.format(api_doc=api_doc, question=params) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 2023000c..58ee1d65 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -169,7 +169,8 @@ def generate(self, prompt: str, image: Optional[Union[str, Path]] = None) -> str return cast(str, response.choices[0].message.content) def generate_classifier(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=CLIP.doc, question=prompt) + api_doc = CLIP.description + "\n" + str(CLIP.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=prompt) response = self.client.chat.completions.create( model=self.model_name, messages=[ @@ -191,7 +192,8 @@ def generate_classifier(self, prompt: str) -> ImageTool: return CLIP(**cast(Mapping, prompt)) def generate_detector(self, params: str) -> ImageTool: - params = CHOOSE_PARAMS.format(api_doc=GroundingDINO.doc, question=params) + api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) + params = CHOOSE_PARAMS.format(api_doc=api_doc, question=params) response = self.client.chat.completions.create( model=self.model_name, messages=[ @@ -213,7 +215,8 @@ def generate_detector(self, params: str) -> ImageTool: return GroundingDINO(**cast(Mapping, params)) def generate_segmentor(self, prompt: str) -> ImageTool: - prompt = CHOOSE_PARAMS.format(api_doc=GroundingSAM.doc, question=prompt) + api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=prompt) response = self.client.chat.completions.create( model=self.model_name, messages=[ From 38f73a4379ff042cfbc93856b89da166fb36bfce Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:17:09 -0700 Subject: [PATCH 06/12] fix formatting --- vision_agent/agent/easytool.py | 3 ++- vision_agent/llm/llm.py | 8 +++++--- 2 files changed, 7 insertions(+), 4 
deletions(-) diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py index 4690a829..b80f1a7b 100644 --- a/vision_agent/agent/easytool.py +++ b/vision_agent/agent/easytool.py @@ -160,7 +160,7 @@ def retrieval( tool_name = tool_instructions["name"] parameters = choose_parameter(model, question, tool_usage, previous_log) - if parameters is None: # TODO + if parameters is None: # TODO pass tool_results = [{"tool_name": tool_name, "parameters": parameters}] @@ -212,6 +212,7 @@ class EasyTool(Agent): >>> print(resp) >>> "It will travel approximately 31.03 kilometers in 29 minutes." """ + def __init__( self, task_model: Optional[Union[LLM, LMM]] = None, diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index 91e3ebe8..dcf61762 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -31,7 +31,9 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: class OpenAILLM(LLM): r"""An LLM class for any OpenAI LLM model.""" - def __init__(self, model_name: str = "gpt-4-turbo-preview", json_mode: bool = False): + def __init__( + self, model_name: str = "gpt-4-turbo-preview", json_mode: bool = False + ): self.model_name = model_name self.client = OpenAI() self.json_mode = json_mode @@ -43,7 +45,7 @@ def generate(self, prompt: str) -> str: messages=[ {"role": "user", "content": prompt}, ], - **kwargs, # type: ignore + **kwargs, # type: ignore ) return cast(str, response.choices[0].message.content) @@ -53,7 +55,7 @@ def chat(self, chat: List[Dict[str, str]]) -> str: response = self.client.chat.completions.create( model=self.model_name, messages=chat, # type: ignore - **kwargs, # type: ignore + **kwargs, # type: ignore ) return cast(str, response.choices[0].message.content) From 8fded0b67172256b2c05d1cf8920b8ab5afb5db2 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:19:50 -0700 Subject: [PATCH 07/12] fix flake8 --- vision_agent/agent/easytool.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py index b80f1a7b..ac35f805 100644 --- a/vision_agent/agent/easytool.py +++ b/vision_agent/agent/easytool.py @@ -54,7 +54,7 @@ def task_decompose( str_result = model(prompt) result = parse_json(str_result) return result["Tasks"] - except Exception as _: + except Exception: if tries > 10: raise ValueError(f"Failed task_decompose on: {str_result}") tries += 1 @@ -78,8 +78,8 @@ def task_topology( elt["dep"] = [elt["dep"]] elif isinstance(elt["dep"], list): elt["dep"] = [int(dep) for dep in elt["dep"]] - return result["Tasks"] ## TODO - except Exception as _: + return result["Tasks"] + except Exception: if tries > 10: raise ValueError(f"Failed task_topology on: {str_result}") tries += 1 @@ -97,7 +97,7 @@ def choose_tool( str_result = model(prompt) result = parse_json(str_result) return result["ID"] - except Exception as _: + except Exception: if tries > 10: raise ValueError(f"Failed choose_tool on: {str_result}") tries += 1 @@ -118,7 +118,7 @@ def choose_parameter( str_result = model(prompt) result = parse_json(str_result) return result["Parameters"] - except Exception as _: + except Exception: if tries > 10: raise ValueError(f"Failed choose_parameter on: {str_result}") tries += 1 @@ -155,7 +155,6 @@ def retrieval( pass tool_instructions = tools[tool_id] - tool_description = tool_instructions["description"] tool_usage = tool_instructions["usage"] tool_name = tool_instructions["name"] @@ -175,7 +174,7 @@ def parse_tool_results(result: Dict[str, 
Union[Dict, List]]) -> Any: parameters[change_name(key)] = result["parameters"][key] # TODO: wrap call to handle errors call_result = tools[tool_id]["class"]()(**parameters) - if call_result == None: + if call_result is None: continue call_results.append(call_result) elif isinstance(result["parameters"], List): @@ -184,7 +183,7 @@ def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: for key in param_list: parameters[change_name(key)] = param_list[key] call_result = tools[tool_id]["class"]()(**parameters) - if call_result == None: + if call_result is None: continue call_results.append(call_result) return call_results From 066959c50f7e4f27b934dd4b73caf819c208a64b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 16:36:35 -0700 Subject: [PATCH 08/12] fix type errors --- vision_agent/agent/easytool.py | 23 +++----- vision_agent/llm/llm.py | 2 +- vision_agent/tools/tools.py | 101 ++++++++++++++++----------------- 3 files changed, 60 insertions(+), 66 deletions(-) diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py index ac35f805..b21114ec 100644 --- a/vision_agent/agent/easytool.py +++ b/vision_agent/agent/easytool.py @@ -16,7 +16,7 @@ ) -def parse_json(s: str) -> Dict: +def parse_json(s: str) -> Any: s = ( s.replace(": true", ": True") .replace(": false", ": False") @@ -28,7 +28,7 @@ def parse_json(s: str) -> Dict: return json.loads(s) -def change_name(name: str): +def change_name(name: str) -> str: change_list = ["from", "class", "return", "false", "true", "id", "and", "", "ID"] if name in change_list: name = "is_" + name.lower() @@ -53,7 +53,7 @@ def task_decompose( try: str_result = model(prompt) result = parse_json(str_result) - return result["Tasks"] + return result["Tasks"] # type: ignore except Exception: if tries > 10: raise ValueError(f"Failed task_decompose on: {str_result}") @@ -78,7 +78,7 @@ def task_topology( elt["dep"] = [elt["dep"]] elif isinstance(elt["dep"], list): elt["dep"] = [int(dep) for dep in elt["dep"]] - return result["Tasks"] + return result["Tasks"] # type: ignore except Exception: if tries > 10: raise ValueError(f"Failed task_topology on: {str_result}") @@ -96,7 +96,7 @@ def choose_tool( try: str_result = model(prompt) result = parse_json(str_result) - return result["ID"] + return result["ID"] # type: ignore except Exception: if tries > 10: raise ValueError(f"Failed choose_tool on: {str_result}") @@ -217,15 +217,10 @@ def __init__( task_model: Optional[Union[LLM, LMM]] = None, answer_model: Optional[Union[LLM, LMM]] = None, ): - if task_model is None: - self.task_model = OpenAILLM(json_mode=True) - else: - self.task_model = task_model - - if answer_model is None: - self.answer_model = OpenAILLM() - else: - self.answer_model = answer_model + self.task_model = ( + OpenAILLM(json_mode=True) if task_model is None else task_model + ) + self.answer_model = OpenAILLM() if answer_model is None else answer_model self.retrieval_num = 3 self.tools = TOOLS diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index dcf61762..bb6e7d7f 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -55,7 +55,7 @@ def chat(self, chat: List[Dict[str, str]]) -> str: response = self.client.chat.completions.create( model=self.model_name, messages=chat, # type: ignore - **kwargs, # type: ignore + **kwargs, ) return cast(str, response.choices[0].message.content) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 1c60738d..d6d66168 100644 --- a/vision_agent/tools/tools.py +++ 
b/vision_agent/tools/tools.py @@ -41,7 +41,13 @@ def rle_decode(mask_rle: str, shape: Tuple[int, int]) -> np.ndarray: return img.reshape(shape) -class ImageTool(ABC): +class Tool(ABC): + name: str + description: str + usage: Dict + + +class ImageTool(Tool): @abstractmethod def __call__(self, image: Union[str, ImageType]) -> List[Dict]: pass @@ -68,7 +74,7 @@ class CLIP(ImageTool): 'Example 2: User Question: "Can you tag this photograph with cat or dog?" {{"Parameters":{{"prompt": ["cat", "dog"]}}}}\n' 'Exmaple 3: User Question: "Can you build me a classifier taht classifies red shirts, green shirts and other?" {{"Parameters":{{"prompt": ["red shirt", "green shirt", "other"]}}}}\n' ) - usage = {} + usage: Dict = {} def __init__(self, prompt: list[str]): self.prompt = prompt @@ -183,7 +189,7 @@ class GroundingSAM(ImageTool): 'Example 2: User Question: "Can you segment the person on the left?" {{"Parameters":{{"prompt": ["person on the left"]}}\n' 'Exmaple 3: User Question: "Can you build me a tool that segments red shirts and green shirts?" {{"Parameters":{{"prompt": ["red shirt", "green shirt"]}}}}\n' ) - usage = {} + usage: Dict = {} def __init__(self, prompt: list[str]): self.prompt = prompt @@ -219,77 +225,69 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: return preds -class Add: +class Add(Tool): name = "add_" description = "'add_' returns the sum of all the arguments passed to it, normalized to 2 decimal places." - usage = ( - { - "required_parameters": {"name": "input", "type": "List[int]"}, - "examples": [ - { - "scenario": "If you want to calculate 2 + 4", - "parameters": {"input": [2, 4]}, - } - ], - }, - ) + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 2 + 4", + "parameters": {"input": [2, 4]}, + } + ], + } def __call__(self, input: List[int]) -> float: return round(sum(input), 2) -class Subtract: +class Subtract(Tool): name = "subtract_" description = "'subtract_' returns the difference of all the arguments passed to it, normalized to 2 decimal places." - usage = ( - { - "required_parameters": {"name": "input", "type": "List[int]"}, - "examples": [ - { - "scenario": "If you want to calculate 4 - 2", - "parameters": {"input": [4, 2]}, - } - ], - }, - ) + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 4 - 2", + "parameters": {"input": [4, 2]}, + } + ], + } def __call__(self, input: List[int]) -> float: return round(input[0] - input[1], 2) -class Multiply: +class Multiply(Tool): name = "multiply_" description = "'multiply_' returns the product of all the arguments passed to it, normalized to 2 decimal places." - usage = ( - { - "required_parameters": {"name": "input", "type": "List[int]"}, - "examples": [ - { - "scenario": "If you want to calculate 2 * 4", - "parameters": {"input": [2, 4]}, - } - ], - }, - ) + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 2 * 4", + "parameters": {"input": [2, 4]}, + } + ], + } def __call__(self, input: List[int]) -> float: return round(input[0] * input[1], 2) -class Divide: +class Divide(Tool): name = "divide_" description = "'divide_' returns the division of all the arguments passed to it, normalized to 2 decimal places." 
- usage = ( - { - "required_parameters": {"name": "input", "type": "List[int]"}, - "examples": [ - { - "scenario": "If you want to calculate 4 / 2", - "parameters": {"input": [4, 2]}, - } - ], - }, - ) + usage = { + "required_parameters": {"name": "input", "type": "List[int]"}, + "examples": [ + { + "scenario": "If you want to calculate 4 / 2", + "parameters": {"input": [4, 2]}, + } + ], + } def __call__(self, input: List[int]) -> float: return round(input[0] / input[1], 2) @@ -300,4 +298,5 @@ def __call__(self, input: List[int]) -> float: for i, c in enumerate( [CLIP, GroundingDINO, GroundingSAM, Add, Subtract, Multiply, Divide] ) + if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage")) } From b36f3849c545ea963557f61716b2ba8c7cc47243 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 20:39:00 -0700 Subject: [PATCH 09/12] updated tests --- tests/fixtures.py | 23 +++++++++++++++++++++++ tests/test_llm.py | 26 +++++++++++++++----------- tests/test_lmm.py | 26 +++++++++++++++----------- 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/tests/fixtures.py b/tests/fixtures.py index 5a64081f..602096ac 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -2,6 +2,8 @@ import pytest +from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM + @pytest.fixture def openai_llm_mock(request): @@ -27,3 +29,24 @@ def openai_lmm_mock(request): choices=[MagicMock(message=MagicMock(content=content))] ) yield mock_instance + + +@pytest.fixture +def clip_mock(request): + with patch.object(CLIP, "__call__", autospec=True) as mock: + mock.return_value = "test" + yield mock + + +@pytest.fixture +def grounding_dino_mock(request): + with patch.object(GroundingDINO, "__call__", autospec=True) as mock: + mock.return_value = "test" + yield mock + + +@pytest.fixture +def grounding_sam_mock(request): + with patch.object(GroundingSAM, "__call__", autospec=True) as mock: + mock.return_value = "test" + yield mock diff --git a/tests/test_llm.py b/tests/test_llm.py index a28288a4..0a671ca5 100644 --- a/tests/test_llm.py +++ b/tests/test_llm.py @@ -1,9 +1,13 @@ import pytest from vision_agent.llm.llm import OpenAILLM -from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM -from .fixtures import openai_llm_mock # noqa: F401 +from .fixtures import ( # noqa: F401 + clip_mock, + grounding_dino_mock, + grounding_sam_mock, + openai_llm_mock, +) @pytest.mark.parametrize( @@ -57,12 +61,12 @@ def test_call_with_mock(openai_llm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_llm_mock"], ) -def test_generate_classifier(openai_llm_mock): # noqa: F811 +def test_generate_classifier(openai_llm_mock, clip_mock): # noqa: F811 llm = OpenAILLM() prompt = "Can you generate a cat classifier?" classifier = llm.generate_classifier(prompt) - assert isinstance(classifier, CLIP) - assert classifier.prompt == "cat" + classifier("image.png") + assert clip_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -70,12 +74,12 @@ def test_generate_classifier(openai_llm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_llm_mock"], ) -def test_generate_detector(openai_llm_mock): # noqa: F811 +def test_generate_detector(openai_llm_mock, grounding_dino_mock): # noqa: F811 llm = OpenAILLM() prompt = "Can you generate a cat detector?" 
detector = llm.generate_detector(prompt) - assert isinstance(detector, GroundingDINO) - assert detector.prompt == "cat" + detector("image.png") + assert grounding_dino_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -83,9 +87,9 @@ def test_generate_detector(openai_llm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_llm_mock"], ) -def test_generate_segmentor(openai_llm_mock): # noqa: F811 +def test_generate_segmentor(openai_llm_mock, grounding_sam_mock): # noqa: F811 llm = OpenAILLM() prompt = "Can you generate a cat segmentor?" segmentor = llm.generate_segmentor(prompt) - assert isinstance(segmentor, GroundingSAM) - assert segmentor.prompt == "cat" + segmentor("image.png") + assert grounding_sam_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} diff --git a/tests/test_lmm.py b/tests/test_lmm.py index c1390726..16b5691f 100644 --- a/tests/test_lmm.py +++ b/tests/test_lmm.py @@ -4,9 +4,13 @@ from PIL import Image from vision_agent.lmm.lmm import OpenAILMM -from vision_agent.tools import CLIP, GroundingDINO, GroundingSAM -from .fixtures import openai_lmm_mock # noqa: F401 +from .fixtures import ( # noqa: F401 + clip_mock, + grounding_dino_mock, + grounding_sam_mock, + openai_lmm_mock, +) def create_temp_image(image_format="jpeg"): @@ -77,12 +81,12 @@ def test_call_with_mock(openai_lmm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_lmm_mock"], ) -def test_generate_classifier(openai_lmm_mock): # noqa: F811 +def test_generate_classifier(openai_lmm_mock, clip_mock): # noqa: F811 lmm = OpenAILMM() prompt = "Can you generate a cat classifier?" classifier = lmm.generate_classifier(prompt) - assert isinstance(classifier, CLIP) - assert classifier.prompt == "cat" + classifier("image.png") + assert clip_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -90,12 +94,12 @@ def test_generate_classifier(openai_lmm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_lmm_mock"], ) -def test_generate_detector(openai_lmm_mock): # noqa: F811 +def test_generate_detector(openai_lmm_mock, grounding_dino_mock): # noqa: F811 lmm = OpenAILMM() prompt = "Can you generate a cat classifier?" detector = lmm.generate_detector(prompt) - assert isinstance(detector, GroundingDINO) - assert detector.prompt == "cat" + detector("image.png") + assert grounding_dino_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} @pytest.mark.parametrize( @@ -103,9 +107,9 @@ def test_generate_detector(openai_lmm_mock): # noqa: F811 ['{"Parameters": {"prompt": "cat"}}'], indirect=["openai_lmm_mock"], ) -def test_generate_segmentor(openai_lmm_mock): # noqa: F811 +def test_generate_segmentor(openai_lmm_mock, grounding_sam_mock): # noqa: F811 lmm = OpenAILMM() prompt = "Can you generate a cat classifier?" 
segmentor = lmm.generate_segmentor(prompt) - assert isinstance(segmentor, GroundingSAM) - assert segmentor.prompt == "cat" + segmentor("image.png") + assert grounding_sam_mock.call_args[1] == {"prompt": "cat", "image": "image.png"} From f0383630e7ddd7a3a19d351cea667e4dda6022a6 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 20:39:22 -0700 Subject: [PATCH 10/12] updated tools --- vision_agent/llm/llm.py | 32 ++++++----- vision_agent/lmm/lmm.py | 27 +++++---- vision_agent/tools/__init__.py | 2 +- vision_agent/tools/tools.py | 102 +++++++++++++++++++-------------- 4 files changed, 90 insertions(+), 73 deletions(-) diff --git a/vision_agent/llm/llm.py b/vision_agent/llm/llm.py index bb6e7d7f..374b58c9 100644 --- a/vision_agent/llm/llm.py +++ b/vision_agent/llm/llm.py @@ -1,6 +1,6 @@ import json from abc import ABC, abstractmethod -from typing import Dict, List, Mapping, Union, cast +from typing import Callable, Dict, List, Mapping, Union, cast from openai import OpenAI @@ -10,7 +10,6 @@ SYSTEM_PROMPT, GroundingDINO, GroundingSAM, - ImageTool, ) @@ -65,9 +64,9 @@ def __call__(self, input: Union[str, List[Dict[str, str]]]) -> str: return self.generate(input) return self.chat(input) - def generate_classifier(self, prompt: str) -> ImageTool: + def generate_classifier(self, question: str) -> Callable: api_doc = CLIP.description + "\n" + str(CLIP.usage) - prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=prompt) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, @@ -80,38 +79,41 @@ def generate_classifier(self, prompt: str) -> ImageTool: params = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] - return CLIP(**cast(Mapping, params)) - def generate_detector(self, params: str) -> ImageTool: + return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x}) + + def generate_detector(self, question: str) -> Callable: api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) - params = CHOOSE_PARAMS.format(api_doc=api_doc, question=params) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": params}, + {"role": "user", "content": prompt}, ], ) - params = json.loads(cast(str, response.choices[0].message.content))[ + params: Mapping = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] - return GroundingDINO(**cast(Mapping, params)) - def generate_segmentor(self, params: str) -> ImageTool: + return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x}) + + def generate_segmentor(self, question: str) -> Callable: api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) - params = CHOOSE_PARAMS.format(api_doc=api_doc, question=params) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, response_format={"type": "json_object"}, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": params}, + {"role": "user", "content": prompt}, ], ) - params = json.loads(cast(str, response.choices[0].message.content))[ + params: Mapping = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] - return GroundingSAM(**cast(Mapping, params)) + 
+ return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x}) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 58ee1d65..48d449f5 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -3,7 +3,7 @@ import logging from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Dict, List, Mapping, Optional, Union, cast +from typing import Any, Callable, Dict, List, Optional, Union, cast import requests from openai import OpenAI @@ -14,7 +14,6 @@ SYSTEM_PROMPT, GroundingDINO, GroundingSAM, - ImageTool, ) _LOGGER = logging.getLogger(__name__) @@ -168,9 +167,9 @@ def generate(self, prompt: str, image: Optional[Union[str, Path]] = None) -> str ) return cast(str, response.choices[0].message.content) - def generate_classifier(self, prompt: str) -> ImageTool: + def generate_classifier(self, question: str) -> Callable: api_doc = CLIP.description + "\n" + str(CLIP.usage) - prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=prompt) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, messages=[ @@ -180,7 +179,7 @@ def generate_classifier(self, prompt: str) -> ImageTool: ) try: - prompt = json.loads(cast(str, response.choices[0].message.content))[ + params = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] except json.JSONDecodeError: @@ -189,16 +188,16 @@ def generate_classifier(self, prompt: str) -> ImageTool: ) raise ValueError("Failed to decode response") - return CLIP(**cast(Mapping, prompt)) + return lambda x: CLIP()(**{"prompt": params["prompt"], "image": x}) - def generate_detector(self, params: str) -> ImageTool: + def generate_detector(self, question: str) -> Callable: api_doc = GroundingDINO.description + "\n" + str(GroundingDINO.usage) - params = CHOOSE_PARAMS.format(api_doc=api_doc, question=params) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, - {"role": "user", "content": params}, + {"role": "user", "content": prompt}, ], ) @@ -212,11 +211,11 @@ def generate_detector(self, params: str) -> ImageTool: ) raise ValueError("Failed to decode response") - return GroundingDINO(**cast(Mapping, params)) + return lambda x: GroundingDINO()(**{"prompt": params["prompt"], "image": x}) - def generate_segmentor(self, prompt: str) -> ImageTool: + def generate_segmentor(self, question: str) -> Callable: api_doc = GroundingSAM.description + "\n" + str(GroundingSAM.usage) - prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=prompt) + prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question) response = self.client.chat.completions.create( model=self.model_name, messages=[ @@ -226,7 +225,7 @@ def generate_segmentor(self, prompt: str) -> ImageTool: ) try: - prompt = json.loads(cast(str, response.choices[0].message.content))[ + params = json.loads(cast(str, response.choices[0].message.content))[ "Parameters" ] except json.JSONDecodeError: @@ -235,7 +234,7 @@ def generate_segmentor(self, prompt: str) -> ImageTool: ) raise ValueError("Failed to decode response") - return GroundingSAM(**cast(Mapping, prompt)) + return lambda x: GroundingSAM()(**{"prompt": params["prompt"], "image": x}) def get_lmm(name: str) -> LMM: diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index a608c059..dae14d66 100644 --- a/vision_agent/tools/__init__.py +++ 
b/vision_agent/tools/__init__.py @@ -1,2 +1,2 @@ from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT -from .tools import CLIP, TOOLS, GroundingDINO, GroundingSAM, ImageTool +from .tools import CLIP, TOOLS, GroundingDINO, GroundingSAM, Tool diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index d6d66168..d4c400d0 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,5 +1,5 @@ import logging -from abc import ABC, abstractmethod +from abc import ABC from pathlib import Path from typing import Any, Dict, List, Tuple, Union, cast @@ -17,10 +17,10 @@ def normalize_bbox( ) -> List[float]: r"""Normalize the bounding box coordinates to be between 0 and 1.""" x1, y1, x2, y2 = bbox - x1 = x1 / image_size[1] - y1 = y1 / image_size[0] - x2 = x2 / image_size[1] - y2 = y2 / image_size[0] + x1 = round(x1 / image_size[1], 2) + y1 = round(y1 / image_size[0], 2) + x2 = round(x2 / image_size[1], 2) + y2 = round(y2 / image_size[0], 2) return [x1, y1, x2, y2] @@ -47,13 +47,7 @@ class Tool(ABC): usage: Dict -class ImageTool(Tool): - @abstractmethod - def __call__(self, image: Union[str, ImageType]) -> List[Dict]: - pass - - -class CLIP(ImageTool): +class CLIP(Tool): r"""CLIP is a tool that can classify or tag any image given a set if input classes or tags. @@ -70,19 +64,32 @@ class CLIP(ImageTool): description = ( "'clip_' is a tool that can classify or tag any image given a set if input classes or tags." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" - 'Example 1: User Question: "Can you classify this image as a cat?" {{"Parameters":{{"prompt": ["cat"]}}}}\n' - 'Example 2: User Question: "Can you tag this photograph with cat or dog?" {{"Parameters":{{"prompt": ["cat", "dog"]}}}}\n' - 'Exmaple 3: User Question: "Can you build me a classifier taht classifies red shirts, green shirts and other?" {{"Parameters":{{"prompt": ["red shirt", "green shirt", "other"]}}}}\n' ) - usage: Dict = {} - - def __init__(self, prompt: list[str]): - self.prompt = prompt + usage = { + "required_parameters": [{"name": "prompt", "type": "List[str]"}, {"name": "image", "type": "str"}], + "examples": [ + { + "scenario": "Can you classify this image as a cat? Image name: cat.jpg", + "parameters": {"prompt": ["cat"], "image": "cat.jpg"}, + }, + { + "scenario": "Can you tag this photograph with cat or dog? Image name: cat_dog.jpg", + "parameters": {"prompt": ["cat", "dog"], "image": "cat_dog.jpg"}, + }, + { + "scenario": "Can you build me a classifier that classifies red shirts, green shirts and other? 
Image name: shirts.jpg", + "parameters": { + "prompt": ["red shirt", "green shirt", "other"], + "image": "shirts.jpg", + }, + }, + ], + } - def __call__(self, image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: image_b64 = convert_to_b64(image) data = { - "classes": self.prompt, + "classes": prompt, "images": [image_b64], } res = requests.post( @@ -99,7 +106,7 @@ def __call__(self, image: Union[str, ImageType]) -> List[Dict]: return cast(List[Dict], resp_json["data"]) -class GroundingDINO(ImageTool): +class GroundingDINO(Tool): _ENDPOINT = "https://chnicr4kes5ku77niv2zoytggq0qyqlp.lambda-url.us-east-2.on.aws" name = "grounding_dino_" @@ -113,31 +120,28 @@ class GroundingDINO(ImageTool): 'An example output would be: [{"label": ["car"], "score": [0.99], "bbox": [[0.1, 0.2, 0.3, 0.4]]}]\n' ) usage = { - "required_parameters": {"name": "prompt", "type": "str"}, + "required_parameters": [{"name": "prompt", "type": "str"}, {"name": "image", "type": "str"}], "examples": [ { "scenario": "Can you build me a car detector?", - "parameters": {"prompt": "car"}, + "parameters": {"prompt": "car", "image": ""}, }, { - "scenario": "Can you detect the person on the left?", - "parameters": {"prompt": "person on the left"}, + "scenario": "Can you detect the person on the left? Image name: person.jpg", + "parameters": {"prompt": "person on the left", "image": "person.jpg"}, }, { - "scenario": "Detect the red shirts and green shirst.", - "parameters": {"prompt": "red shirt. green shirt"}, + "scenario": "Detect the red shirts and green shirst. Image name: shirts.jpg", + "parameters": {"prompt": "red shirt. green shirt", "image": "shirts.jpg"}, }, ], } - def __init__(self, prompt: str): - self.prompt = prompt - - def __call__(self, image: Union[str, Path, ImageType]) -> List[Dict]: + def __call__(self, prompt: str, image: Union[str, Path, ImageType]) -> List[Dict]: image_size = get_image_size(image) image_b64 = convert_to_b64(image) data = { - "prompt": self.prompt, + "prompt": prompt, "images": [image_b64], } res = requests.post( @@ -157,10 +161,12 @@ def __call__(self, image: Union[str, Path, ImageType]) -> List[Dict]: elt["bboxes"] = [ normalize_bbox(box, image_size) for box in elt["bboxes"] ] + if "scores" in elt: + elt["scores"] = [round(score, 2) for score in elt["scores"]] return cast(List[Dict], resp_data) -class GroundingSAM(ImageTool): +class GroundingSAM(Tool): r"""Grounding SAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. @@ -185,19 +191,29 @@ class GroundingSAM(ImageTool): description = ( "'grounding_sam_' is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions." "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" - 'Example 1: User Question: "Can you build me a car segmentor?" {{"Parameters":{{"prompt": ["car"]}}}}\n' - 'Example 2: User Question: "Can you segment the person on the left?" {{"Parameters":{{"prompt": ["person on the left"]}}\n' - 'Exmaple 3: User Question: "Can you build me a tool that segments red shirts and green shirts?" 
{{"Parameters":{{"prompt": ["red shirt", "green shirt"]}}}}\n' ) - usage: Dict = {} - - def __init__(self, prompt: list[str]): - self.prompt = prompt + usage = { + "required_parameters": [{"name": "prompt", "type": "List[str]"}, {"name": "image", "type": "str"}], + "examples": [ + { + "scenario": "Can you build me a car segmentor?", + "parameters": {"prompt": ["car"], "image": ""}, + }, + { + "scenario": "Can you segment the person on the left? Image name: person.jpg", + "parameters": {"prompt": ["person on the left"], "image": "person.jpg"}, + }, + { + "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg", + "parameters": {"prompt": ["red shirt", "green shirt"], "image": "shirts.jpg"}, + }, + ] + } - def __call__(self, image: Union[str, ImageType]) -> List[Dict]: + def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]: image_b64 = convert_to_b64(image) data = { - "classes": self.prompt, + "classes": prompt, "image": image_b64, } res = requests.post( From a7f4b58dfa4c99a69034626df3692301e3036199 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 20:39:33 -0700 Subject: [PATCH 11/12] updated easytools --- vision_agent/agent/easytool.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/vision_agent/agent/easytool.py b/vision_agent/agent/easytool.py index b21114ec..b0fc5f2c 100644 --- a/vision_agent/agent/easytool.py +++ b/vision_agent/agent/easytool.py @@ -1,6 +1,6 @@ import json from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union from vision_agent import LLM, LMM, OpenAILLM from vision_agent.tools import TOOLS @@ -141,6 +141,10 @@ def answer_summarize( return model(prompt) +def function_call(tool: Callable, parameters: Dict[str, Any]) -> Any: + return tool()(**parameters) + + def retrieval( model: Union[LLM, LMM, Agent], question: str, @@ -167,28 +171,22 @@ def retrieval( pass def parse_tool_results(result: Dict[str, Union[Dict, List]]) -> Any: - call_results = [] + call_results: List[Any] = [] if isinstance(result["parameters"], Dict): - parameters = {} - for key in result["parameters"]: - parameters[change_name(key)] = result["parameters"][key] - # TODO: wrap call to handle errors - call_result = tools[tool_id]["class"]()(**parameters) - if call_result is None: - continue - call_results.append(call_result) + call_result = function_call(tools[tool_id]["class"], result["parameters"]) + if call_result is None: + return call_results + call_results.append(call_result) elif isinstance(result["parameters"], List): - for param_list in result["parameters"]: - parameters = {} - for key in param_list: - parameters[change_name(key)] = param_list[key] - call_result = tools[tool_id]["class"]()(**parameters) + for parameters in result["parameters"]: + call_result = function_call(tools[tool_id]["class"], parameters) if call_result is None: continue call_results.append(call_result) return call_results call_results = [] + __import__("ipdb").set_trace() if isinstance(tool_results, Set) or isinstance(tool_results, List): for result in tool_results: call_results.extend(parse_tool_results(result)) @@ -210,6 +208,9 @@ class EasyTool(Agent): >>> resp = agent("If a car is traveling at 64 km/h, how many kilometers does it travel in 29 minutes?") >>> print(resp) >>> "It will travel approximately 31.03 kilometers in 29 minutes." 
+ >>> resp = agent("How many cards are in this image?", image="cards.jpg") + >>> print(resp) + >>> "There are 2 cards in this image." """ def __init__( @@ -238,6 +239,8 @@ def chat( self, chat: List[Dict[str, str]], image: Optional[Union[str, Path]] = None ) -> str: question = chat[0]["content"] + if image: + question += f" Image name: {image}" tasks = task_decompose( self.task_model, question, From 88ac6a2141893385f978df7ec43b8843f1ffde37 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 19 Mar 2024 20:48:24 -0700 Subject: [PATCH 12/12] formatting fix --- vision_agent/tools/tools.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index d4c400d0..c1b8fe2d 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -66,7 +66,10 @@ class CLIP(Tool): "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" ) usage = { - "required_parameters": [{"name": "prompt", "type": "List[str]"}, {"name": "image", "type": "str"}], + "required_parameters": [ + {"name": "prompt", "type": "List[str]"}, + {"name": "image", "type": "str"}, + ], "examples": [ { "scenario": "Can you classify this image as a cat? Image name: cat.jpg", @@ -120,7 +123,10 @@ class GroundingDINO(Tool): 'An example output would be: [{"label": ["car"], "score": [0.99], "bbox": [[0.1, 0.2, 0.3, 0.4]]}]\n' ) usage = { - "required_parameters": [{"name": "prompt", "type": "str"}, {"name": "image", "type": "str"}], + "required_parameters": [ + {"name": "prompt", "type": "str"}, + {"name": "image", "type": "str"}, + ], "examples": [ { "scenario": "Can you build me a car detector?", @@ -132,7 +138,10 @@ class GroundingDINO(Tool): }, { "scenario": "Detect the red shirts and green shirst. Image name: shirts.jpg", - "parameters": {"prompt": "red shirt. green shirt", "image": "shirts.jpg"}, + "parameters": { + "prompt": "red shirt. green shirt", + "image": "shirts.jpg", + }, }, ], } @@ -193,7 +202,10 @@ class GroundingSAM(Tool): "Here are some exmaples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n" ) usage = { - "required_parameters": [{"name": "prompt", "type": "List[str]"}, {"name": "image", "type": "str"}], + "required_parameters": [ + {"name": "prompt", "type": "List[str]"}, + {"name": "image", "type": "str"}, + ], "examples": [ { "scenario": "Can you build me a car segmentor?", @@ -205,9 +217,12 @@ class GroundingSAM(Tool): }, { "scenario": "Can you build me a tool that segments red shirts and green shirts? Image name: shirts.jpg", - "parameters": {"prompt": ["red shirt", "green shirt"], "image": "shirts.jpg"}, + "parameters": { + "prompt": ["red shirt", "green shirt"], + "image": "shirts.jpg", + }, }, - ] + ], } def __call__(self, prompt: List[str], image: Union[str, ImageType]) -> List[Dict]:
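After this series the vision tools are stateless: the prompt and image are passed directly to __call__ instead of being stored on the instance in __init__, each tool declares its inputs in a structured "usage" dict, and EasyTool invokes the selected tool through function_call. The following is a minimal sketch of the resulting calling convention, not part of the patches themselves; the image filenames are placeholders, and the calls assume the hosted tool endpoints used by requests.post are reachable.

    from vision_agent.tools import CLIP, GroundingDINO
    from vision_agent.agent.easytool import function_call

    # Tools are constructed with no arguments; prompt and image are supplied
    # per call, mirroring the "required_parameters" in each tool's usage dict.
    clip = CLIP()
    tags = clip(prompt=["cat", "dog"], image="cat_dog.jpg")  # placeholder image path

    # EasyTool reaches the same code path through function_call, which
    # instantiates the chosen tool class and forwards the parameters the
    # task model selected.
    boxes = function_call(
        GroundingDINO, {"prompt": "person on the left", "image": "person.jpg"}
    )

The "Image name: ..." suffixes added to the usage examples match the behavior introduced in PATCH 11/12, where EasyTool.chat appends the image name to the question so the parameter-selection prompt can fill in the "image" argument.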