Add GPT-4o as a VQA tool #221

Merged · 6 commits · Sep 5, 2024
Changes from all commits
59 changes: 0 additions & 59 deletions tests/unit/test_lmm.py
@@ -1,8 +1,6 @@
import json
import tempfile
from unittest.mock import patch

import numpy as np
import pytest
from PIL import Image

@@ -163,60 +161,3 @@ def test_chat_ollama_mock(chat_ollama_lmm_mock): # noqa: F811
assert response == "mocked response"
call_args = json.loads(chat_ollama_lmm_mock.call_args.kwargs["data"])
assert call_args["messages"][0]["content"] == "test prompt"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_classifier(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.clip") as clip_mock:
clip_mock.return_value = "test"
clip_mock.__name__ = "clip"
clip_mock.__doc__ = "clip"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
classifier = lmm.generate_classifier(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
classifier(dummy_image)
assert clip_mock.call_args[0][1] == "cat"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_detector(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.owl_v2") as owl_v2_mock:
owl_v2_mock.return_value = "test"
owl_v2_mock.__name__ = "owl_v2"
owl_v2_mock.__doc__ = "owl_v2"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
detector = lmm.generate_detector(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
detector(dummy_image)
assert owl_v2_mock.call_args[0][0] == "cat"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_segmentor(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.grounding_sam") as grounding_sam_mock:
grounding_sam_mock.return_value = "test"
grounding_sam_mock.__name__ = "grounding_sam"
grounding_sam_mock.__doc__ = "grounding_sam"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
segmentor = lmm.generate_segmentor(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
segmentor(dummy_image)
assert grounding_sam_mock.call_args[0][0] == "cat"
162 changes: 26 additions & 136 deletions vision_agent/lmm/lmm.py
@@ -1,85 +1,44 @@
import base64
import io
import json
import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Union, cast
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast

import anthropic
import requests
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
from openai import AzureOpenAI, OpenAI
from PIL import Image

import vision_agent.tools as T
from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
from vision_agent.utils.image_utils import encode_media

from .types import Message

_LOGGER = logging.getLogger(__name__)


def encode_image_bytes(image: bytes) -> str:
image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
buffer = io.BytesIO()
image.save(buffer, format="PNG") # type: ignore
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
return encoded_image


def encode_media(media: Union[str, Path]) -> str:
if type(media) is str and media.startswith(("http", "https")):
# for mp4 video url, we assume there is a same url but ends with png
# vision-agent-ui will upload this png when uploading the video
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
return media[:-4] + ".png"
return media
extension = "png"
extension = Path(media).suffix
if extension.lower() not in {
".jpg",
".jpeg",
".png",
".webp",
".bmp",
".mp4",
".mov",
}:
raise ValueError(f"Unsupported image extension: {extension}")

image_bytes = b""
if extension.lower() in {".mp4", ".mov"}:
frames = T.extract_frames(media)
image = frames[len(frames) // 2]
buffer = io.BytesIO()
Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
image_bytes = buffer.getvalue()
else:
image_bytes = open(media, "rb").read()
return encode_image_bytes(image_bytes)


class LMM(ABC):
@abstractmethod
def generate(
self, prompt: str, media: Optional[List[Union[str, Path]]] = None, **kwargs: Any
self,
prompt: str,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass

@abstractmethod
def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass

@abstractmethod
def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass
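
Note: the encode_image_bytes and encode_media helpers deleted in the hunk above are not gone from the package; the new import at the top of the file pulls encode_media from vision_agent.utils.image_utils instead. A minimal usage sketch, assuming the relocated function keeps the signature shown in the deleted code (not confirmed by this diff):

    # Sketch only: assumes encode_media(media: Union[str, Path]) -> str survives the move unchanged.
    from vision_agent.utils.image_utils import encode_media

    encoded = encode_media("examples/cat.png")  # hypothetical local image path
    # Per the removed implementation above, http/https URLs are returned untouched,
    # while local images are read and returned as a base64-encoded PNG string.
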
@@ -111,7 +70,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -120,13 +79,13 @@ def __call__(

def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
"""Chat with the LMM model.

Parameters:
chat (List[Dict[str, str]]): A list of dictionaries containing the chat
chat (Squence[Dict[str, str]]): A list of dictionaries containing the chat
messages. The messages can be in the format:
[Collaborator review comment on the "Squence" line above: NIT, typo; should be "Sequence".]
[{"role": "user", "content": "Hello!"}, ...]
or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ def chat(
"url": (
encoded_media
if encoded_media.startswith(("http", "https"))
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
@@ -174,7 +134,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ def generate(
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{encoded_media}",
"url": (
encoded_media
if encoded_media.startswith(("http", "https"))
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
},
},
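
Both chat and generate now forward media that is already a remote URL or a data: URI instead of re-encoding it into a base64 data URL. The selection logic, pulled out as a standalone sketch for clarity (the helper name is illustrative, not part of this PR):

    def to_image_url(encoded_media: str) -> str:
        # Remote URLs and pre-built data URIs pass through unchanged;
        # anything else is treated as raw base64 and wrapped as a PNG data URL.
        if encoded_media.startswith(("http", "https")) or encoded_media.startswith("data:image/"):
            return encoded_media
        return f"data:image/png;base64,{encoded_media}"
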
@@ -214,81 +179,6 @@ def f() -> Iterator[Optional[str]]:
else:
return cast(str, response.choices[0].message.content)

def generate_classifier(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.clip])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.clip(x, params["prompt"])

def generate_detector(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.owl_v2])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.owl_v2(params["prompt"], x)

def generate_segmentor(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.grounding_sam])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.grounding_sam(params["prompt"], x)

def generate_image_qa_tool(self, question: str) -> Callable:
return lambda x: T.git_vqa_v2(question, x)


class AzureOpenAILMM(OpenAILMM):
def __init__(
@@ -362,7 +252,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -371,13 +261,13 @@ def __call__(

def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
"""Chat with the LMM model.

Parameters:
chat (List[Dict[str, str]]): A list of dictionaries containing the chat
chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
messages. The messages can be in the format:
[{"role": "user", "content": "Hello!"}, ...]
or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Dict[str, Any]]],
input: Union[str, Sequence[Dict[str, Any]]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -502,7 +392,7 @@ def __call__(

def chat(
self,
chat: List[Dict[str, Any]],
chat: Sequence[Dict[str, Any]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
content: List[Union[TextBlockParam, ImageBlockParam]] = [
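
Across the LMM implementations in this file (OpenAILMM, AzureOpenAILMM, and the Ollama- and Anthropic-backed classes), the public chat/generate/__call__ signatures now accept Sequence instead of List, so immutable inputs such as tuples type-check without casting. A small sketch under that assumption (the import path, model instance, and file names are placeholders):

    # Sketch: Sequence[Union[str, Path]] admits tuples as well as lists.
    from pathlib import Path
    from vision_agent.lmm.lmm import OpenAILMM  # class defined in this file; a shorter re-export may exist

    lmm = OpenAILMM()  # assumes OPENAI_API_KEY is configured in the environment
    media = (Path("frame1.png"), Path("frame2.png"))  # hypothetical files
    response = lmm.generate("Describe what changed between these frames.", media=media)
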
8 changes: 5 additions & 3 deletions vision_agent/tools/__init__.py
@@ -16,6 +16,8 @@
clip,
closest_box_distance,
closest_mask_distance,
countgd_counting,
countgd_example_based_counting,
depth_anything_v2,
detr_segmentation,
dpt_hybrid_midas,
@@ -30,20 +32,20 @@
generate_soft_edge_image,
get_tool_documentation,
git_vqa_v2,
gpt4o_image_vqa,
gpt4o_video_vqa,
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_video_vqa,
load_image,
loca_visual_prompt_counting,
loca_zero_shot_counting,
countgd_counting,
countgd_example_based_counting,
ocr,
overlay_bounding_boxes,
overlay_counting_results,
overlay_heat_map,
overlay_segmentation_masks,
overlay_counting_results,
owl_v2,
save_image,
save_json,
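
The user-facing additions are the gpt4o_image_vqa and gpt4o_video_vqa tools exported above. A hedged usage sketch, assuming they follow the (prompt, media) calling convention of the other VQA tools in this module such as ixc25_image_vqa (the exact signatures live in the tools implementation, which is not part of this diff excerpt):

    # Sketch only: function signatures and the sample image path are assumptions.
    import vision_agent.tools as T

    image = T.load_image("examples/street.jpg")  # hypothetical local image
    answer = T.gpt4o_image_vqa("How many cars are visible in this image?", image)
    print(answer)
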