Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding basic tools #11

Merged
merged 5 commits into from
Mar 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions vision_agent/lmm/lmm.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
import base64
import json
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Union, cast

import requests

from vision_agent.tools import (
SYSTEM_PROMPT,
CHOOSE_PARAMS,
ImageTool,
CLIP,
GroundingDINO,
GroundingSAM,
)

# Configure logging when this module is imported: basicConfig sets up the
# root logger at INFO level (it does nothing if the root logger already has
# handlers configured).
logging.basicConfig(level=logging.INFO)

# Module-level logger, named after this module, used for error reporting below.
_LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -90,6 +100,75 @@ def generate(self, prompt: str, image: Optional[Union[str, Path]] = None) -> str
)
return cast(str, response.choices[0].message.content)

def generate_classifier(self, prompt: str) -> ImageTool:
    """Build a CLIP tool whose prompt is extracted from the user's request.

    Args:
        prompt: The user's natural-language question.

    Returns:
        A ``CLIP`` instance constructed with the prompt chosen by the model.

    Raises:
        ValueError: If the model reply cannot be decoded into a prompt.
    """
    return self._generate_tool(CLIP, prompt)

def generate_detector(self, prompt: str) -> ImageTool:
    """Build a GroundingDINO tool whose prompt is extracted from the request.

    Args:
        prompt: The user's natural-language question.

    Returns:
        A ``GroundingDINO`` instance constructed with the prompt chosen by
        the model.

    Raises:
        ValueError: If the model reply cannot be decoded into a prompt.
    """
    return self._generate_tool(GroundingDINO, prompt)

def generate_segmentor(self, prompt: str) -> ImageTool:
    """Build a GroundingSAM tool whose prompt is extracted from the request.

    Args:
        prompt: The user's natural-language question.

    Returns:
        A ``GroundingSAM`` instance constructed with the prompt chosen by
        the model.

    Raises:
        ValueError: If the model reply cannot be decoded into a prompt.
    """
    return self._generate_tool(GroundingSAM, prompt)

def _generate_tool(self, tool_cls: Any, question: str) -> ImageTool:
    """Ask the chat model for ``tool_cls`` parameters and instantiate the tool.

    Shared implementation behind ``generate_classifier``,
    ``generate_detector`` and ``generate_segmentor``.

    Args:
        tool_cls: An ``ImageTool`` subclass exposing a ``doc`` string and
            accepting the extracted prompt as its only constructor argument.
        question: The user's natural-language question.

    Returns:
        ``tool_cls`` instantiated with the extracted prompt.

    Raises:
        ValueError: If the reply is not valid JSON or carries no ``"prompt"``.
    """
    request = CHOOSE_PARAMS.format(api_doc=tool_cls.doc, question=question)
    response = self.client.chat.completions.create(
        model="gpt-4-turbo-preview",  # no need to use vision model here
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": request},
        ],
    )
    content = response.choices[0].message.content

    try:
        payload = json.loads(cast(str, content))
        # CHOOSE_PARAMS instructs the model to nest its arguments under a
        # top-level "Parameters" key; a bare top-level "prompt" is still
        # accepted so replies in the previously expected shape keep working.
        params = payload.get("Parameters", payload)
        tool_prompt = params["prompt"]
    except (json.JSONDecodeError, AttributeError, KeyError, TypeError) as err:
        # AttributeError/TypeError: reply (or "Parameters") is not a JSON
        # object, or the message content is None. KeyError: no "prompt".
        _LOGGER.error("Failed to decode response: %s", content)
        raise ValueError("Failed to decode response") from err

    return cast(ImageTool, tool_cls(tool_prompt))


def get_lmm(name: str) -> LMM:
if name == "openai":
Expand Down
2 changes: 2 additions & 0 deletions vision_agent/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .prompts import SYSTEM_PROMPT, CHOOSE_PARAMS
from .tools import ImageTool, CLIP, GroundingDINO, GroundingSAM
19 changes: 19 additions & 0 deletions vision_agent/tools/prompts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# System message sent with every parameter-selection request.
SYSTEM_PROMPT = "You are a helpful assistant."

# EasyTool prompts
# Template with two placeholders, {api_doc} and {question}. Literal JSON braces
# are doubled so that str.format emits them as single braces. The lines are
# joined with "\n"; the final empty entry yields the trailing newline.
CHOOSE_PARAMS = "\n".join(
    (
        "This is an API tool documentation. Given a user's question, you need to output parameters according to the API tool documentation to successfully call the API to solve the user's question.",
        "This is the API tool documentation: {api_doc}",
        "Please note that: ",
        "1. The Example in the API tool documentation can help you better understand the use of the API.",
        '2. Ensure the parameters you output are correct. The output must contain the required parameters, and can contain the optional parameters based on the question. If no paremters in the required parameters and optional parameters, just leave it as {{"Parameters":{{}}}}',
        "3. If the user's question mentions other APIs, you should ONLY consider the API tool documentation I give and do not consider other APIs.",
        '4. If you need to use this API multiple times, please set "Parameters" to a list.',
        "5. You must ONLY output in a parsible JSON format. Two examples output looks like:",
        "'''",
        'Example 1: {{"Parameters":{{"keyword": "Artificial Intelligence", "language": "English"}}}}',
        'Example 2: {{"Parameters":[{{"keyword": "Artificial Intelligence", "language": "English"}}, {{"keyword": "Machine Learning", "language": "English"}}]}}',
        "'''",
        "This is user's question: {question}",
        "Output:",
        "",
    )
)
58 changes: 58 additions & 0 deletions vision_agent/tools/tools.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from typing import Dict, List, Union
from abc import ABC, abstractmethod

from PIL.Image import Image as ImageType


class ImageTool(ABC):
    """Abstract base class for tools that are called on an image."""

    @abstractmethod
    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
        """Run the tool on ``image`` and return a list of result dicts.

        ``image`` is either a string or a PIL image object; the meaning of a
        string argument (presumably a file path or URL) is left to the
        concrete subclass, which must override this method.
        """


class CLIP(ImageTool):
    """Image classification / tagging tool configured with candidate labels."""

    # Tool description handed to the LLM. It is substituted into CHOOSE_PARAMS
    # as the value of ``api_doc``; str.format does not re-process substituted
    # values, so JSON braces here must be written singly (doubled braces would
    # reach the model verbatim as invalid JSON).
    doc = (
        "CLIP is a tool that can classify or tag any image given a set of input classes or tags. "
        "Here are some examples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n"
        'Example 1: User Question: "Can you classify this image as a cat?" {"Parameters":{"prompt": ["cat"]}}\n'
        'Example 2: User Question: "Can you tag this photograph with cat or dog?" {"Parameters":{"prompt": ["cat", "dog"]}}\n'
        'Example 3: User Question: "Can you build me a classifier that classifies red shirts, green shirts and other?" {"Parameters":{"prompt": ["red shirt", "green shirt", "other"]}}\n'
    )

    def __init__(self, prompt: Union[str, List[str]]):
        """Store the class/tag prompt.

        Args:
            prompt: The labels to classify against. The documented examples
                pass a list of strings; a plain string is still accepted.
        """
        self.prompt = prompt

    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
        """Classify ``image``; not implemented yet."""
        raise NotImplementedError


class GroundingDINO(ImageTool):
    """Object detection tool configured with a free-text prompt."""

    # Tool description handed to the LLM. It is substituted into CHOOSE_PARAMS
    # as the value of ``api_doc``; str.format does not re-process substituted
    # values, so JSON braces here must be written singly and must balance.
    doc = (
        "Grounding DINO is a tool that can detect arbitrary objects with inputs such as category names or referring expressions. "
        "Here are some examples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n"
        'Example 1: User Question: "Can you build me a car detector?" {"Parameters":{"prompt": "car"}}\n'
        'Example 2: User Question: "Can you detect the person on the left?" {"Parameters":{"prompt": "person on the left"}}\n'
        'Example 3: User Question: "Can you build me a tool that detects red shirts and green shirts?" {"Parameters":{"prompt": "red shirt. green shirt"}}\n'
    )

    def __init__(self, prompt: str):
        """Store the detection prompt.

        Args:
            prompt: Category names or a referring expression; the documented
                examples separate multiple categories with ". ".
        """
        self.prompt = prompt

    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
        """Detect objects in ``image``; not implemented yet."""
        raise NotImplementedError


class GroundingSAM(ImageTool):
    """Detection-and-segmentation tool configured with a free-text prompt."""

    # Tool description handed to the LLM. It is substituted into CHOOSE_PARAMS
    # as the value of ``api_doc``; str.format does not re-process substituted
    # values, so JSON braces here must be written singly and must balance.
    doc = (
        "Grounding SAM is a tool that can detect and segment arbitrary objects with inputs such as category names or referring expressions. "
        "Here are some examples of how to use the tool, the examples are in the format of User Question: which will have the user's question in quotes followed by the parameters in JSON format, which is the parameters you need to output to call the API to solve the user's question.\n"
        'Example 1: User Question: "Can you build me a car segmentor?" {"Parameters":{"prompt": "car"}}\n'
        'Example 2: User Question: "Can you segment the person on the left?" {"Parameters":{"prompt": "person on the left"}}\n'
        'Example 3: User Question: "Can you build me a tool that segments red shirts and green shirts?" {"Parameters":{"prompt": "red shirt. green shirt"}}\n'
    )

    def __init__(self, prompt: str):
        """Store the segmentation prompt.

        Args:
            prompt: Category names or a referring expression; the documented
                examples separate multiple categories with ". ".
        """
        self.prompt = prompt

    def __call__(self, image: Union[str, ImageType]) -> List[Dict]:
        """Detect and segment objects in ``image``; not implemented yet."""
        raise NotImplementedError
Loading