Add GPT-4o as a VQA tool (#221)
* moved image related utils to image_utils

* isort

* added gpt4o tool

* format fix

* remove generate tests

* flake8 black
dillonalaird authored Sep 5, 2024
1 parent ab510a6 commit 14c8e05
Showing 7 changed files with 150 additions and 205 deletions.
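The headline change adds GPT-4o as a visual question answering (VQA) tool, exported as gpt4o_image_vqa and gpt4o_video_vqa in vision_agent/tools/__init__.py (diff below). A minimal usage sketch, assuming the new tools follow the same (prompt, image) calling convention as the existing VQA tools such as git_vqa_v2; the exact signatures and the sample file names are assumptions:

```python
from vision_agent.tools import gpt4o_image_vqa, load_image

# Assumed convention: VQA tools take (prompt, image), mirroring git_vqa_v2.
image = load_image("photo.jpg")  # hypothetical local image file
answer = gpt4o_image_vqa("How many cats are in this image?", image)
print(answer)

# The video variant presumably takes a video path or frames (assumption):
# answer = gpt4o_video_vqa("What happens in this clip?", "clip.mp4")
```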
59 changes: 0 additions & 59 deletions tests/unit/test_lmm.py
@@ -1,8 +1,6 @@
import json
import tempfile
from unittest.mock import patch

import numpy as np
import pytest
from PIL import Image

@@ -163,60 +161,3 @@ def test_chat_ollama_mock(chat_ollama_lmm_mock): # noqa: F811
assert response == "mocked response"
call_args = json.loads(chat_ollama_lmm_mock.call_args.kwargs["data"])
assert call_args["messages"][0]["content"] == "test prompt"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_classifier(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.clip") as clip_mock:
clip_mock.return_value = "test"
clip_mock.__name__ = "clip"
clip_mock.__doc__ = "clip"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
classifier = lmm.generate_classifier(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
classifier(dummy_image)
assert clip_mock.call_args[0][1] == "cat"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_detector(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.owl_v2") as owl_v2_mock:
owl_v2_mock.return_value = "test"
owl_v2_mock.__name__ = "owl_v2"
owl_v2_mock.__doc__ = "owl_v2"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
detector = lmm.generate_detector(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
detector(dummy_image)
assert owl_v2_mock.call_args[0][0] == "cat"


@pytest.mark.parametrize(
"openai_lmm_mock",
['{"Parameters": {"prompt": "cat"}}'],
indirect=["openai_lmm_mock"],
)
def test_generate_segmentor(openai_lmm_mock): # noqa: F811
with patch("vision_agent.tools.grounding_sam") as grounding_sam_mock:
grounding_sam_mock.return_value = "test"
grounding_sam_mock.__name__ = "grounding_sam"
grounding_sam_mock.__doc__ = "grounding_sam"

lmm = OpenAILMM()
prompt = "Can you generate a cat classifier?"
segmentor = lmm.generate_segmentor(prompt)
dummy_image = np.zeros((10, 10, 3)).astype(np.uint8)
segmentor(dummy_image)
assert grounding_sam_mock.call_args[0][0] == "cat"
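The three deleted tests exercised OpenAILMM.generate_classifier, generate_detector, and generate_segmentor, which this commit removes from vision_agent/lmm/lmm.py (next file). Code that relied on those factory helpers can call the underlying tools directly; a minimal sketch, assuming owl_v2 keeps the (prompt, image) signature used by the removed lambda:

```python
import numpy as np

import vision_agent.tools as T

# Before: detector = lmm.generate_detector("Can you generate a cat detector?")
#         detector(image)
# After: call the tool directly with the prompt the LMM would have extracted.
image = np.zeros((10, 10, 3), dtype=np.uint8)  # dummy image, as in the old test
detections = T.owl_v2("cat", image)
```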
162 changes: 26 additions & 136 deletions vision_agent/lmm/lmm.py
@@ -1,85 +1,44 @@
import base64
import io
import json
import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Union, cast
from typing import Any, Dict, Iterator, List, Optional, Sequence, Union, cast

import anthropic
import requests
from anthropic.types import ImageBlockParam, MessageParam, TextBlockParam
from openai import AzureOpenAI, OpenAI
from PIL import Image

import vision_agent.tools as T
from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT
from vision_agent.utils.image_utils import encode_media

from .types import Message

_LOGGER = logging.getLogger(__name__)


def encode_image_bytes(image: bytes) -> str:
image = Image.open(io.BytesIO(image)).convert("RGB") # type: ignore
buffer = io.BytesIO()
image.save(buffer, format="PNG") # type: ignore
encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")
return encoded_image


def encode_media(media: Union[str, Path]) -> str:
if type(media) is str and media.startswith(("http", "https")):
# for mp4 video url, we assume there is a same url but ends with png
# vision-agent-ui will upload this png when uploading the video
if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1:
return media[:-4] + ".png"
return media
extension = "png"
extension = Path(media).suffix
if extension.lower() not in {
".jpg",
".jpeg",
".png",
".webp",
".bmp",
".mp4",
".mov",
}:
raise ValueError(f"Unsupported image extension: {extension}")

image_bytes = b""
if extension.lower() in {".mp4", ".mov"}:
frames = T.extract_frames(media)
image = frames[len(frames) // 2]
buffer = io.BytesIO()
Image.fromarray(image[0]).convert("RGB").save(buffer, format="PNG")
image_bytes = buffer.getvalue()
else:
image_bytes = open(media, "rb").read()
return encode_image_bytes(image_bytes)


class LMM(ABC):
@abstractmethod
def generate(
self, prompt: str, media: Optional[List[Union[str, Path]]] = None, **kwargs: Any
self,
prompt: str,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass

@abstractmethod
def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass

@abstractmethod
def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
pass
@@ -111,7 +70,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -120,13 +79,13 @@ def __call__(

def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
"""Chat with the LMM model.
Parameters:
chat (List[Dict[str, str]]): A list of dictionaries containing the chat
chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
messages. The messages can be in the format:
[{"role": "user", "content": "Hello!"}, ...]
or if it contains media, it should be in the format:
@@ -147,6 +106,7 @@ def chat(
"url": (
encoded_media
if encoded_media.startswith(("http", "https"))
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
@@ -174,7 +134,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
message: List[Dict[str, Any]] = [
@@ -192,7 +152,12 @@ def generate(
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{encoded_media}",
"url": (
encoded_media
if encoded_media.startswith(("http", "https"))
or encoded_media.startswith("data:image/")
else f"data:image/png;base64,{encoded_media}"
),
"detail": "low",
},
},
@@ -214,81 +179,6 @@ def f() -> Iterator[Optional[str]]:
else:
return cast(str, response.choices[0].message.content)

def generate_classifier(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.clip])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.clip(x, params["prompt"])

def generate_detector(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.owl_v2])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.owl_v2(params["prompt"], x)

def generate_segmentor(self, question: str) -> Callable:
api_doc = T.get_tool_documentation([T.grounding_sam])
prompt = CHOOSE_PARAMS.format(api_doc=api_doc, question=question)
response = self.client.chat.completions.create(
model=self.model_name,
messages=[
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": prompt},
],
response_format={"type": "json_object"},
)

try:
params = json.loads(cast(str, response.choices[0].message.content))[
"Parameters"
]
except json.JSONDecodeError:
_LOGGER.error(
f"Failed to decode response: {response.choices[0].message.content}"
)
raise ValueError("Failed to decode response")

return lambda x: T.grounding_sam(params["prompt"], x)

def generate_image_qa_tool(self, question: str) -> Callable:
return lambda x: T.git_vqa_v2(question, x)


class AzureOpenAILMM(OpenAILMM):
def __init__(
@@ -362,7 +252,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Message]],
input: Union[str, Sequence[Message]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -371,13 +261,13 @@ def __call__(

def chat(
self,
chat: List[Message],
chat: Sequence[Message],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
"""Chat with the LMM model.
Parameters:
chat (List[Dict[str, str]]): A list of dictionaries containing the chat
chat (Sequence[Dict[str, str]]): A list of dictionaries containing the chat
messages. The messages can be in the format:
[{"role": "user", "content": "Hello!"}, ...]
or if it contains media, it should be in the format:
@@ -429,7 +319,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
url = f"{self.url}/generate"
@@ -493,7 +383,7 @@ def __init__(

def __call__(
self,
input: Union[str, List[Dict[str, Any]]],
input: Union[str, Sequence[Dict[str, Any]]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
if isinstance(input, str):
@@ -502,7 +392,7 @@ def chat(

def chat(
self,
chat: List[Dict[str, Any]],
chat: Sequence[Dict[str, Any]],
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
messages: List[MessageParam] = []
@@ -551,7 +441,7 @@ def f() -> Iterator[Optional[str]]:
def generate(
self,
prompt: str,
media: Optional[List[Union[str, Path]]] = None,
media: Optional[Sequence[Union[str, Path]]] = None,
**kwargs: Any,
) -> Union[str, Iterator[Optional[str]]]:
content: List[Union[TextBlockParam, ImageBlockParam]] = [
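Two cross-cutting changes run through lmm.py: the encode_image_bytes and encode_media helpers move out to vision_agent.utils.image_utils (the "moved image related utils" bullet in the commit message), and the public signatures widen List to Sequence so callers can pass tuples or other read-only sequences. A short sketch of both, assuming encode_media keeps the behavior of the removed implementation (http(s) URLs pass through, local files come back as base64-encoded PNG):

```python
from vision_agent.utils.image_utils import encode_media  # the helper's new home

encoded = encode_media("photo.png")  # hypothetical file; base64 PNG per the old code
passthrough = encode_media("https://example.com/photo.png")  # URLs return unchanged

# chat() now accepts Sequence[Message], so a tuple of messages is valid input:
messages = ({"role": "user", "content": "Hello!"},)
```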
8 changes: 5 additions & 3 deletions vision_agent/tools/__init__.py
@@ -16,6 +16,8 @@
clip,
closest_box_distance,
closest_mask_distance,
countgd_counting,
countgd_example_based_counting,
depth_anything_v2,
detr_segmentation,
dpt_hybrid_midas,
@@ -30,20 +32,20 @@
generate_soft_edge_image,
get_tool_documentation,
git_vqa_v2,
gpt4o_image_vqa,
gpt4o_video_vqa,
grounding_dino,
grounding_sam,
ixc25_image_vqa,
ixc25_video_vqa,
load_image,
loca_visual_prompt_counting,
loca_zero_shot_counting,
countgd_counting,
countgd_example_based_counting,
ocr,
overlay_bounding_boxes,
overlay_counting_results,
overlay_heat_map,
overlay_segmentation_masks,
overlay_counting_results,
owl_v2,
save_image,
save_json,
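Besides the new gpt4o exports, this hunk re-sorts the import list (the isort pass from the commit message): countgd_counting, countgd_example_based_counting, and overlay_counting_results only move to their alphabetical positions, they are not new. A quick smoke test that the names this file now exports all resolve:

```python
# Every name below appears in the diff above; this only checks they import.
from vision_agent.tools import (
    countgd_counting,
    countgd_example_based_counting,
    gpt4o_image_vqa,
    gpt4o_video_vqa,
    overlay_counting_results,
)
```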
(Diffs for the remaining 4 changed files are not shown.)
