Add GPT-4o as a VQA tool (#221)
* moved image related utils to image_utils

* isort

* added gpt4o tool

* format fix

* remove generate tests

* flake8 black
dillonalaird authored Sep 5, 2024
1 parent ab510a6 commit 14c8e05
Showing 7 changed files with 150 additions and 205 deletions.
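The headline change adds GPT-4o as a visual question answering (VQA) tool, exported as gpt4o_image_vqa and gpt4o_video_vqa in vision_agent/tools/__init__.py (diff below). A minimal usage sketch, assuming the new tools follow the same (prompt, image) calling convention as the existing VQA tools such as git_vqa_v2; the exact signatures and the sample file names are assumptions:

```python
from vision_agent.tools import gpt4o_image_vqa, load_image

# Assumed convention: VQA tools take (prompt, image), mirroring git_vqa_v2.
image = load_image("photo.jpg")  # hypothetical local image file
answer = gpt4o_image_vqa("How many cats are in this image?", image)
print(answer)

# The video variant presumably takes a video path or frames (assumption):
# answer = gpt4o_video_vqa("What happens in this clip?", "clip.mp4")
```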
59 changes: 0 additions & 59 deletions tests/unit/test_lmm.py
@@ -1,8 +1,6 @@
import json
import tempfile
from unittest.mock import patch

import numpy as np
import pytest
from PIL import Image

@@ -163,60 +161,3 @@ def test_chat_ollama_mock(chat_ollama_lmm_mock): # noqa: F811
assert response == "mocked response"
call_args = json.loads(chat_ollama_lmm_mock.call_args.kwargs["data"])
assert call_args["messages"][0]["content"] == "test prompt"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_classifier(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.clip") as clip_mock:
clip_mock.return_value = "test"
clip_mock.__name__ = "clip"
clip_mock.__doc__ = "clip"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
classifier = lmm.generate_classifier(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
classifier(dummy_image)
assert clip_mock.call_args[0][1] == "cat"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_detector(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.owl_v2") as owl_v2_mock:
owl_v2_mock.return_value = "test"
owl_v2_mock.__name__ = "owl_v2"
owl_v2_mock.__doc__ = "owl_v2"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
detector = lmm.generate_detector(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
detector(dummy_image)
assert owl_v2_mock.call_args[0][0] == "cat"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_segmentor(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.grounding_sam") as grounding_sam_mock:
grounding_sam_mock.return_value = "test"
grounding_sam_mock.__name__ = "grounding_sam"
grounding_sam_mock.__doc__ = "grounding_sam"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
segmentor = lmm.generate_segmentor(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
segmentor(dummy_image)
assert grounding_sam_mock.call_args[0][0] == "cat"
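The three deleted tests exercised OpenAILMM.generate_classifier, generate_detector, and generate_segmentor, which this commit removes from vision_agent/lmm/lmm.py (next file). Code that relied on those factory helpers can call the underlying tools directly; a minimal sketch, assuming owl_v2 keeps the (prompt, image) signature used by the removed lambda:

```python
import numpy as np

import vision_agent.tools as T

# Before: detector = lmm.generate_detector("Can you generate a cat detector?")
#         detector(image)
# After: call the tool directly with the prompt the LMM would have extracted.
image = np.zeros((10, 10, 3), dtype=np.uint8)  # dummy image, as in the old test
detections = T.owl_v2("cat", image)
```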
162 changes: 26 additions & 136 deletions vision_agent/lmm/lmm.py
@@ -1,85 +1,44 @@
import base64
import io
import json
import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Union, cast
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast

import anthropic
import requests
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
from openai import AzureOpenAI, OpenAI
from PIL import Image

import vision_agent.tools as T
from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
from vision_agent.utils.image_utils import encode_media

from .types import Message

_LOGGER = logging.getLogger(__name__)


def encode_image_bytes(image: bytes) -> str:
image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
buffer = io.BytesIO()
image.save(buffer, format="PNG") # type: ignore
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
return encoded_image


def encode_media(media: Union[str, Path]) -> str:
if type(media) is str and media.startswith(("http", "https")):
# for mp4 video url, we assume there is a same url but ends with png
# vision-agent-ui will upload this png when uploading the video
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
return media[:-4] + ".png"
return media
extension = "png"
extension = Path(media).suffix
if extension.lower() not in {
".jpg",
".jpeg",
".png",
".webp",
".bmp",
".mp4",
".mov",
}:
raise ValueError(f"Unsupported image extension: {extension}")

image_bytes = b""
if extension.lower() in {".mp4", ".mov"}:
frames = T.extract_frames(media)
image = frames[len(frames) // 2]
buffer = io.BytesIO()
Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
image_bytes = buffer.getvalue()
else:
image_bytes = open(media, "rb").read()
return encode_image_bytes(image_bytes)


class LMM(ABC):
@abstractmethod
def generate(
self, prompt: str, media: Optional[List[Union[str, Path]]] = None, **kwargs: Any
self,
prompt: str,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass

@abstractmethod
def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass

@abstractmethod
def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass
@@ -111,7 +70,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -120,13 +79,13 @@ def __call__(

def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
"""Chat with the LMM model.
Parameters:
chat (List[Dict[str, str]]): A list of dictionaries containing the chat
chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
messages. The messages can be in the format:
[{"role": "user", "content": "Hello!"}, ...]
or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ def chat(
"url": (
encoded_media
if encoded_media.startswith(("http", "https"))
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
@@ -174,7 +134,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ def generate(
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{encoded_media}",
"url": (
encoded_media
if encoded_media.startswith(("http", "https"))
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
},
},
@@ -214,81 +179,6 @@ def f() -> Iterator[Optional[str]]:
else:
return cast(str, response.choices[0].message.content)

def generate_classifier(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.clip])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.clip(x, params["prompt"])

def generate_detector(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.owl_v2])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.owl_v2(params["prompt"], x)

def generate_segmentor(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.grounding_sam])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.grounding_sam(params["prompt"], x)

def generate_image_qa_tool(self, question: str) -> Callable:
return lambda x: T.git_vqa_v2(question, x)


class AzureOpenAILMM(OpenAILMM):
def __init__(
@@ -362,7 +252,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -371,13 +261,13 @@ def __call__(

def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
"""Chat with the LMM model.
Parameters:
chat (List[Dict[str, str]]): A list of dictionaries containing the chat
chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
messages. The messages can be in the format:
[{"role": "user", "content": "Hello!"}, ...]
or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Dict[str, Any]]],
input: Union[str, Sequence[Dict[str, Any]]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -502,7 +392,7 @@ def chat(

def chat(
self,
chat: List[Dict[str, Any]],
chat: Sequence[Dict[str, Any]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
content: List[Union[TextBlockParam, ImageBlockParam]] = [
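Two cross-cutting changes run through lmm.py: the encode_image_bytes and encode_media helpers move out to vision_agent.utils.image_utils (the "moved image related utils" bullet in the commit message), and the public signatures widen List to Sequence so callers can pass tuples or other read-only sequences. A short sketch of both, assuming encode_media keeps the behavior of the removed implementation (http(s) URLs pass through, local files come back as base64-encoded PNG):

```python
from vision_agent.utils.image_utils import encode_media  # the helper's new home

encoded = encode_media("photo.png")  # hypothetical file; base64 PNG per the old code
passthrough = encode_media("https://example.com/photo.png")  # URLs return unchanged

# chat() now accepts Sequence[Message], so a tuple of messages is valid input:
messages = ({"role": "user", "content": "Hello!"},)
```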
8 changes: 5 additions & 3 deletions vision_agent/tools/__init__.py
@@ -16,6 +16,8 @@
clip,
closest_box_distance,
closest_mask_distance,
countgd_counting,
countgd_example_based_counting,
depth_anything_v2,
detr_segmentation,
dpt_hybrid_midas,
@@ -30,20 +32,20 @@
generate_soft_edge_image,
get_tool_documentation,
git_vqa_v2,
gpt4o_image_vqa,
gpt4o_video_vqa,
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_video_vqa,
load_image,
loca_visual_prompt_counting,
loca_zero_shot_counting,
countgd_counting,
countgd_example_based_counting,
ocr,
overlay_bounding_boxes,
overlay_counting_results,
overlay_heat_map,
overlay_segmentation_masks,
overlay_counting_results,
owl_v2,
save_image,
save_json,
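Besides the new gpt4o exports, this hunk re-sorts the import list (the isort pass from the commit message): countgd_counting, countgd_example_based_counting, and overlay_counting_results only move to their alphabetical positions, they are not new. A quick smoke test that the names this file now exports all resolve:

```python
# Every name below appears in the diff above; this only checks they import.
from vision_agent.tools import (
    countgd_counting,
    countgd_example_based_counting,
    gpt4o_image_vqa,
    gpt4o_video_vqa,
    overlay_counting_results,
)
```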
(Diffs for the remaining 4 changed files are not shown.)
