diff --git a/README.md b/README.md
index a5f24480..fda69986 100644
--- a/README.md
+++ b/README.md
@@ -168,20 +168,18 @@ result = agent.chat_with_workflow(conv)
 
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can also ask an LMM directly to build a tool for
-you. For example:
+while others are hosted for you. You can easily access them yourself, for example if
+you want to run `owl_v2` and visualize the output you can run:
 
 ```python
->>> import vision_agent as va
->>> lmm = va.lmm.OpenAILMM()
->>> detector = lmm.generate_detector("Can you build a jar detector for me?")
->>> detector(va.tools.load_image("jar.jpg"))
-[{"labels": ["jar",],
-    "scores": [0.99],
-    "bboxes": [
-        [0.58, 0.2, 0.72, 0.45],
-    ]
-}]
+import vision_agent.tools as T
+import matplotlib.pyplot as plt
+
+image = T.load_image("dogs.jpg")
+dets = T.owl_v2("dogs", image)
+viz = T.overlay_bounding_boxes(image, dets)
+plt.imshow(viz)
+plt.show()
 ```
 
 You can also add custom tools to the agent:
@@ -214,6 +212,41 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
 
+## Additional LLMs
+### Ollama
+We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
+a few models:
+
+```bash
+ollama pull llama3.1
+ollama pull mxbai-embed-large
+```
+
+`llama3.1` is used for the `OllamaLMM` in `OllamaVisionAgentCoder`. Normally we would
+use an actual LMM such as `llava`, but `llava` cannot handle the long context lengths
+required by the agent. Since `llama3.1` cannot handle images you may see some
+performance degradation. `mxbai-embed-large` is the embedding model used to look up
+tools. You can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.OllamaVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
+
+### Azure OpenAI
+We also provide an `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started
+follow the Azure Setup section below. You can use it just like you would use
+`VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.AzureVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
+
 ### Azure Setup
 If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
 
@@ -252,6 +285,6 @@ agent = va.agent.AzureVisionAgentCoder()
 2. Follow the instructions to purchase and manage your API credits.
 3. Ensure your API key is correctly configured in your project settings.
 
-Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
-
-For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
+Failure to have sufficient API credits may result in limited or no functionality for
+the features that rely on the OpenAI API. For more details on managing your API usage
+and credits, please refer to the OpenAI API documentation.
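If the default `llama3.1` tag proves too weak for `OllamaVisionAgentCoder` (see the warning above), the constructor added in this change accepts custom LMMs, so a larger Ollama model can be swapped in. A minimal sketch, assuming a local Ollama server and that the illustrative `llama3.1:70b` tag has already been pulled:

```python
# Sketch: overriding the default Ollama models used by OllamaVisionAgentCoder.
# Assumes `ollama pull llama3.1:70b` and `ollama pull mxbai-embed-large` have been
# run locally; the 70b tag is only an example of a stronger model.
import vision_agent as va
from vision_agent.lmm import OllamaLMM

# json_mode=True for the roles that must return JSON (planning, debugging).
planner = OllamaLMM(model_name="llama3.1:70b", temperature=0.0, json_mode=True)
coder = OllamaLMM(model_name="llama3.1:70b", temperature=0.0)

agent = va.agent.OllamaVisionAgentCoder(
    planner=planner,
    coder=coder,
    tester=coder,
    debugger=planner,
)
code = agent("Count the apples in the image", media="apples.jpg")
```

Any argument left as `None` falls back to the `llama3.1` defaults shown in the diff below.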
diff --git a/docs/index.md b/docs/index.md
index d0ed8178..fc5ddde1 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,4 +1,9 @@
 # 🔍🤖 Vision Agent
+[![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew)
+![ci_status](https://github.com/landing-ai/vision-agent/actions/workflows/ci_cd.yml/badge.svg)
+[![PyPI version](https://badge.fury.io/py/vision-agent.svg)](https://badge.fury.io/py/vision-agent)
+![version](https://img.shields.io/pypi/pyversions/vision-agent)
+
 
 Vision Agent is a library that helps you utilize agent frameworks to generate code to
 solve your vision task. Many current vision problems can easily take hours or days to
@@ -160,20 +165,18 @@ result = agent.chat_with_workflow(conv)
 
 ### Tools
 There are a variety of tools for the model or the user to use. Some are executed locally
-while others are hosted for you. You can also ask an LMM directly to build a tool for
-you. For example:
+while others are hosted for you. You can easily access them yourself, for example if
+you want to run `owl_v2` and visualize the output you can run:
 
 ```python
->>> import vision_agent as va
->>> lmm = va.lmm.OpenAILMM()
->>> detector = lmm.generate_detector("Can you build a jar detector for me?")
->>> detector(va.tools.load_image("jar.jpg"))
-[{"labels": ["jar",],
-    "scores": [0.99],
-    "bboxes": [
-        [0.58, 0.2, 0.72, 0.45],
-    ]
-}]
+import vision_agent.tools as T
+import matplotlib.pyplot as plt
+
+image = T.load_image("dogs.jpg")
+dets = T.owl_v2("dogs", image)
+viz = T.overlay_bounding_boxes(image, dets)
+plt.imshow(viz)
+plt.show()
 ```
 
 You can also add custom tools to the agent:
@@ -206,6 +209,40 @@ function. Make sure the documentation is in the same format above with descripti
 `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case
 [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool.
 
+## Additional LLMs
+### Ollama
+We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download
+a few models:
+
+```bash
+ollama pull llama3.1
+ollama pull mxbai-embed-large
+```
+
+`llama3.1` is used for the `OllamaLMM` in `OllamaVisionAgentCoder`. Normally we would
+use an actual LMM such as `llava`, but `llava` cannot handle the long context lengths
+required by the agent. Since `llama3.1` cannot handle images you may see some
+performance degradation. `mxbai-embed-large` is the embedding model used to look up
+tools. You can use it just like you would use `VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.OllamaVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B.
+
+### Azure OpenAI
+We also provide an `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started
+follow the Azure Setup section below. You can use it just like you would use
+`VisionAgentCoder`:
+
+```python
+>>> import vision_agent as va
+>>> agent = va.agent.AzureVisionAgentCoder()
+>>> agent("Count the apples in the image", media="apples.jpg")
+```
+
 ### Azure Setup
 If you want to use Azure OpenAI models, you need to have two OpenAI model deployments:
 
@@ -244,6 +281,6 @@ agent = va.agent.AzureVisionAgentCoder()
 2. Follow the instructions to purchase and manage your API credits.
 3. Ensure your API key is correctly configured in your project settings.
 
-Failure to have sufficient API credits may result in limited or no functionality for the features that rely on the OpenAI API.
-
-For more details on managing your API usage and credits, please refer to the OpenAI API documentation.
+Failure to have sufficient API credits may result in limited or no functionality for
+the features that rely on the OpenAI API. For more details on managing your API usage
+and credits, please refer to the OpenAI API documentation.
diff --git a/docs/lmms.md b/docs/lmms.md
deleted file mode 100644
index 21c329e0..00000000
--- a/docs/lmms.md
+++ /dev/null
@@ -1,20 +0,0 @@
-### LMMs
-One of the problems of dealing with image data is it can be difficult to organize and
-search. For example, you might have a bunch of pictures of houses and want to count how
-many yellow houses you have, or how many houses with adobe roofs. The vision agent
-library uses LMMs to help create tags or descriptions of images to allow you to search
-over them, or use them in a database to carry out other operations.
-
-To get started, you can use an LMM to start generating text from images. The following
-code will use the LLaVA-1.6 34B model to generate a description of the image you pass it.
-
-```python
-import vision_agent as va
-
-model = va.lmm.get_lmm("llava")
-model.generate("Describe this image", "image.png")
->>> "A yellow house with a green lawn."
-```
-
-**WARNING** We are hosting the LLaVA-1.6 34B model, if it times out please wait ~3-5
-min for the server to warm up as it shuts down when usage is low.
diff --git a/tests/unit/fixtures.py b/tests/unit/fixtures.py
index ccad51e8..a56ebac6 100644
--- a/tests/unit/fixtures.py
+++ b/tests/unit/fixtures.py
@@ -31,3 +31,27 @@ def generator():
         mock_instance = mock.return_value
         mock_instance.chat.completions.create.return_value = mock_generate()
         yield mock_instance
+
+
+@pytest.fixture
+def generate_ollama_lmm_mock(request):
+    content = request.param
+
+    mock_resp = MagicMock()
+    mock_resp.status_code = 200
+    mock_resp.json.return_value = {"response": content}
+    with patch("vision_agent.lmm.lmm.requests.post") as mock:
+        mock.return_value = mock_resp
+        yield mock
+
+
+@pytest.fixture
+def chat_ollama_lmm_mock(request):
+    content = request.param
+
+    mock_resp = MagicMock()
+    mock_resp.status_code = 200
+    mock_resp.json.return_value = {"message": {"content": content}}
+    with patch("vision_agent.lmm.lmm.requests.post") as mock:
+        mock.return_value = mock_resp
+        yield mock
diff --git a/tests/unit/test_lmm.py b/tests/unit/test_lmm.py
index 9cb43650..c954b173 100644
--- a/tests/unit/test_lmm.py
+++ b/tests/unit/test_lmm.py
@@ -1,3 +1,4 @@
+import json
 import tempfile
 from unittest.mock import patch
 
@@ -5,9 +6,13 @@
 import pytest
 from PIL import Image
 
-from vision_agent.lmm.lmm import OpenAILMM
+from vision_agent.lmm.lmm import OllamaLMM, OpenAILMM
 
-from .fixtures import openai_lmm_mock  # noqa: F401
+from .fixtures import (  # noqa: F401
+    chat_ollama_lmm_mock,
+    generate_ollama_lmm_mock,
+    openai_lmm_mock,
+)
 
 
 def create_temp_image(image_format="jpeg"):
@@ -135,6 +140,31 @@ def test_call_with_mock_stream(openai_lmm_mock):  # noqa: F811
     )
 
 
+@pytest.mark.parametrize(
+    "generate_ollama_lmm_mock",
+    ["mocked response"],
+    indirect=["generate_ollama_lmm_mock"],
+)
+def test_generate_ollama_mock(generate_ollama_lmm_mock):  # noqa: F811
+    temp_image = create_temp_image()
+    lmm = OllamaLMM()
+    response = lmm.generate("test prompt", media=[temp_image])
+    assert response == "mocked response"
+    call_args = json.loads(generate_ollama_lmm_mock.call_args.kwargs["data"])
+    assert call_args["prompt"] == "test prompt"
+
+
+@pytest.mark.parametrize(
+    "chat_ollama_lmm_mock", ["mocked response"], indirect=["chat_ollama_lmm_mock"]
+)
+def test_chat_ollama_mock(chat_ollama_lmm_mock):  # noqa: F811
+    lmm = OllamaLMM()
+    response = lmm.chat([{"role": "user", "content": "test prompt"}])
+    assert response == "mocked response"
+    call_args = json.loads(chat_ollama_lmm_mock.call_args.kwargs["data"])
+    assert call_args["messages"][0]["content"] == "test prompt"
+
+
 @pytest.mark.parametrize(
     "openai_lmm_mock",
     ['{"Parameters": {"prompt": "cat"}}'],
     indirect=["openai_lmm_mock"],
diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py
index 79b6abae..2164d688 100644
--- a/vision_agent/agent/__init__.py
+++ b/vision_agent/agent/__init__.py
@@ -1,3 +1,7 @@
 from .agent import Agent
 from .vision_agent import VisionAgent
-from .vision_agent_coder import AzureVisionAgentCoder, VisionAgentCoder
+from .vision_agent_coder import (
+    AzureVisionAgentCoder,
+    OllamaVisionAgentCoder,
+    VisionAgentCoder,
+)
diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py
index 5d55e963..eb951ccc 100644
--- a/vision_agent/agent/agent_utils.py
+++ b/vision_agent/agent/agent_utils.py
@@ -1,9 +1,24 @@
 import json
 import logging
+import re
 import sys
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 logging.basicConfig(stream=sys.stdout)
+_LOGGER = logging.getLogger(__name__)
+
+
+def _extract_sub_json(json_str: str) -> Optional[Dict[str, Any]]:
+    json_pattern = r"\{.*\}"
+    match = re.search(json_pattern, json_str, re.DOTALL)
+    if match:
+        json_str = match.group()
+        try:
+            json_dict = json.loads(json_str)
+            return json_dict  # type: ignore
+        except json.JSONDecodeError:
+            return None
+    return None
 
 
 def extract_json(json_str: str) -> Dict[str, Any]:
@@ -18,8 +33,16 @@ def extract_json(json_str: str) -> Dict[str, Any]:
         json_str = json_str[json_str.find("```") + len("```") :]
         # get the last ``` not one from an intermediate string
         json_str = json_str[: json_str.find("}```")]
+    try:
+        json_dict = json.loads(json_str)
+    except json.JSONDecodeError as e:
+        json_dict = _extract_sub_json(json_str)
+        if json_dict is not None:
+            return json_dict  # type: ignore
+        error_msg = f"Could not extract JSON from the given str: {json_str}"
+        _LOGGER.exception(error_msg)
+        raise ValueError(error_msg) from e
 
-    json_dict = json.loads(json_str)
     return json_dict  # type: ignore
diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py
index 6bba2905..b10988c6 100644
--- a/vision_agent/agent/vision_agent_coder.py
+++ b/vision_agent/agent/vision_agent_coder.py
@@ -28,11 +28,11 @@
     TEST_PLANS,
     USER_REQ,
 )
-from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM
+from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM
 from vision_agent.utils import CodeInterpreterFactory, Execution
 from vision_agent.utils.execute import CodeInterpreter
 from vision_agent.utils.image_utils import b64_to_pil
-from vision_agent.utils.sim import AzureSim, Sim
+from vision_agent.utils.sim import AzureSim, OllamaSim, Sim
 from vision_agent.utils.video import play_video
 
 logging.basicConfig(stream=sys.stdout)
@@ -267,7 +267,11 @@ def pick_plan(
             pass
         count += 1
 
-    if best_plan is None:
+    if (
+        best_plan is None
+        or "best_plan" not in best_plan
+        or ("best_plan" in best_plan and best_plan["best_plan"] not in plans)
+    ):
         best_plan = {"best_plan": list(plans.keys())[0]}
 
     if verbosity >= 1:
@@ -589,8 +593,8 @@ class VisionAgentCoder(Agent):
 
     Example
    -------
-    >>> from vision_agent.agent import VisionAgentCoder
-    >>> agent = VisionAgentCoder()
+    >>> import vision_agent as va
+    >>> agent = va.agent.VisionAgentCoder()
     >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
     """
 
@@ -857,6 +861,64 @@ def log_progress(self, data: Dict[str, Any]) -> None:
             self.report_progress_callback(data)
 
 
+class OllamaVisionAgentCoder(VisionAgentCoder):
+    """VisionAgentCoder that uses Ollama models for planning, coding, testing.
+
+    Pre-requisites:
+    1. Run ollama pull llama3.1 for the LLM
+    2. Run ollama pull mxbai-embed-large for the embedding similarity model
+
+    Technically you should use a VLM such as llava but llava is not able to handle the
+    context length and crashes.
+
+    Example
+    -------
+    >>> import vision_agent as va
+    >>> agent = va.agent.OllamaVisionAgentCoder()
+    >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+    """
+
+    def __init__(
+        self,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+    ) -> None:
+        super().__init__(
+            planner=(
+                OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                if planner is None
+                else planner
+            ),
+            coder=(
+                OllamaLMM(model_name="llama3.1", temperature=0.0)
+                if coder is None
+                else coder
+            ),
+            tester=(
+                OllamaLMM(model_name="llama3.1", temperature=0.0)
+                if tester is None
+                else tester
+            ),
+            debugger=(
+                OllamaLMM(model_name="llama3.1", temperature=0.0, json_mode=True)
+                if debugger is None
+                else debugger
+            ),
+            tool_recommender=(
+                OllamaSim(T.TOOLS_DF, sim_key="desc")
+                if tool_recommender is None
+                else tool_recommender
+            ),
+            verbosity=verbosity,
+            report_progress_callback=report_progress_callback,
+        )
+
+
 class AzureVisionAgentCoder(VisionAgentCoder):
     """VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing.
 
@@ -866,8 +928,8 @@ class AzureVisionAgentCoder(VisionAgentCoder):
 
     Example
     -------
-    >>> from vision_agent import AzureVisionAgentCoder
-    >>> agent = AzureVisionAgentCoder()
+    >>> import vision_agent as va
+    >>> agent = va.agent.AzureVisionAgentCoder()
     >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
     """
diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py
index 9a8c5bf1..e78a0593 100644
--- a/vision_agent/lmm/lmm.py
+++ b/vision_agent/lmm/lmm.py
@@ -330,12 +330,28 @@ def __init__(
         model_name: str = "llava",
         base_url: Optional[str] = "http://localhost:11434/api",
         json_mode: bool = False,
+        num_ctx: int = 128_000,
         **kwargs: Any,
     ):
+        """Initializes the Ollama LMM. kwargs are passed as 'options' to the model.
+        More information on options can be found here
+        https://github.com/ollama/ollama/blob/main/docs/modelfile.md#valid-parameters-and-values
+
+        Parameters:
+            model_name (str): The ollama name of the model.
+            base_url (str): The base URL of the Ollama API.
+            json_mode (bool): Whether to use JSON mode.
+            num_ctx (int): The context length for the model.
+            kwargs (Any): Additional options to pass to the model.
+ """ + self.url = base_url self.model_name = model_name - self.json_mode = json_mode - self.kwargs = kwargs + self.kwargs = {"options": kwargs} + + if json_mode: + self.kwargs["format"] = "json" # type: ignore + self.kwargs["options"]["num_cxt"] = num_ctx def __call__( self, @@ -369,13 +385,14 @@ def chat( url = f"{self.url}/chat" model = self.model_name messages = fixed_chat - data = {"model": model, "messages": messages} + data: Dict[str, Any] = {"model": model, "messages": messages} tmp_kwargs = self.kwargs | kwargs data.update(tmp_kwargs) - json_data = json.dumps(data) if "stream" in tmp_kwargs and tmp_kwargs["stream"]: + json_data = json.dumps(data) + def f() -> Iterator[Optional[str]]: with requests.post(url, data=json_data, stream=True) as stream: if stream.status_code != 200: @@ -392,13 +409,14 @@ def f() -> Iterator[Optional[str]]: return f() else: - stream = requests.post(url, data=json_data) - if stream.status_code != 200: - raise ValueError( - f"Request failed with status code {stream.status_code}" - ) - stream = stream.json() - return stream["message"]["content"] # type: ignore + data["stream"] = False + json_data = json.dumps(data) + resp = requests.post(url, data=json_data) + + if resp.status_code != 200: + raise ValueError(f"Request failed with status code {resp.status_code}") + resp = resp.json() + return resp["message"]["content"] # type: ignore def generate( self, @@ -408,7 +426,7 @@ def generate( ) -> Union[str, Iterator[Optional[str]]]: url = f"{self.url}/generate" - data = { + data: Dict[str, Any] = { "model": self.model_name, "prompt": prompt, "images": [], @@ -416,13 +434,14 @@ def generate( if media and len(media) > 0: for m in media: - data["images"].append(encode_media(m)) # type: ignore + data["images"].append(encode_media(m)) tmp_kwargs = self.kwargs | kwargs data.update(tmp_kwargs) - json_data = json.dumps(data) if "stream" in tmp_kwargs and tmp_kwargs["stream"]: + json_data = json.dumps(data) + def f() -> Iterator[Optional[str]]: with requests.post(url, data=json_data, stream=True) as stream: if stream.status_code != 200: @@ -439,15 +458,15 @@ def f() -> Iterator[Optional[str]]: return f() else: - stream = requests.post(url, data=json_data) + data["stream"] = False + json_data = json.dumps(data) + resp = requests.post(url, data=json_data) - if stream.status_code != 200: - raise ValueError( - f"Request failed with status code {stream.status_code}" - ) + if resp.status_code != 200: + raise ValueError(f"Request failed with status code {resp.status_code}") - stream = stream.json() - return stream["response"] # type: ignore + resp = resp.json() + return resp["response"] # type: ignore class ClaudeSonnetLMM(LMM): diff --git a/vision_agent/utils/__init__.py b/vision_agent/utils/__init__.py index b440db51..9a5a271a 100644 --- a/vision_agent/utils/__init__.py +++ b/vision_agent/utils/__init__.py @@ -6,5 +6,5 @@ Logs, Result, ) -from .sim import AzureSim, Sim, load_sim, merge_sim +from .sim import AzureSim, OllamaSim, Sim, load_sim, merge_sim from .video import extract_frames_from_video diff --git a/vision_agent/utils/sim.py b/vision_agent/utils/sim.py index c3b26403..5c89f700 100644 --- a/vision_agent/utils/sim.py +++ b/vision_agent/utils/sim.py @@ -1,20 +1,21 @@ import os from functools import lru_cache from pathlib import Path -from typing import Dict, List, Optional, Sequence, Union +from typing import Callable, Dict, List, Optional, Sequence, Union import numpy as np import pandas as pd -from openai import AzureOpenAI, Client, OpenAI +import requests +from openai 
import AzureOpenAI, OpenAI from scipy.spatial.distance import cosine # type: ignore @lru_cache(maxsize=512) def get_embedding( - client: Client, text: str, model: str = "text-embedding-3-small" + emb_call: Callable[[List[str]], List[float]], text: str ) -> List[float]: text = text.replace("\n", " ") - return client.embeddings.create(input=[text], model=model).data[0].embedding + return emb_call([text]) class Sim: @@ -35,14 +36,19 @@ def __init__( model: str: The model to use for embeddings. """ self.df = df - self.client = OpenAI(api_key=api_key) + client = OpenAI(api_key=api_key) + self.emb_call = ( + lambda text: client.embeddings.create(input=text, model=model) + .data[0] + .embedding + ) self.model = model if "embs" not in df.columns and sim_key is None: raise ValueError("key is required if no column 'embs' is present.") if sim_key is not None: self.df["embs"] = self.df[sim_key].apply( - lambda x: get_embedding(self.client, x, model=self.model) + lambda x: get_embedding(self.emb_call, x) ) def save(self, sim_file: Union[str, Path]) -> None: @@ -70,7 +76,7 @@ def top_k( Sequence[Dict]: The top k most similar items. """ - embedding = get_embedding(self.client, query, model=self.model) + embedding = get_embedding(self.emb_call, query) self.df["sim"] = self.df.embs.apply(lambda x: 1 - cosine(x, embedding)) res = self.df.sort_values("sim", ascending=False).head(k) if thresh is not None: @@ -105,17 +111,51 @@ def __init__( ) self.df = df - self.client = AzureOpenAI( + client = AzureOpenAI( api_key=api_key, api_version=api_version, azure_endpoint=azure_endpoint ) + self.emb_call = ( + lambda text: client.embeddings.create(input=text, model=model) + .data[0] + .embedding + ) self.model = model + if "embs" not in df.columns and sim_key is None: + raise ValueError("key is required if no column 'embs' is present.") + + if sim_key is not None: + self.df["embs"] = self.df[sim_key].apply(lambda x: get_embedding(client, x)) + + +class OllamaSim(Sim): + def __init__( + self, + df: pd.DataFrame, + sim_key: Optional[str] = None, + model_name: Optional[str] = None, + base_url: Optional[str] = None, + ) -> None: + self.df = df + if base_url is None: + base_url = "http://localhost:11434/api/embeddings" + if model_name is None: + model_name = "mxbai-embed-large" + + def emb_call(text: List[str]) -> List[float]: + resp = requests.post( + base_url, json={"prompt": text[0], "model": model_name} + ) + return resp.json()["embedding"] # type: ignore + + self.emb_call = emb_call + if "embs" not in df.columns and sim_key is None: raise ValueError("key is required if no column 'embs' is present.") if sim_key is not None: self.df["embs"] = self.df[sim_key].apply( - lambda x: get_embedding(self.client, x, model=self.model) + lambda x: get_embedding(emb_call, x) )
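For reference, a usage sketch of the new `OllamaSim`. Assumptions: a local Ollama server (default `http://localhost:11434/api/embeddings`) with `mxbai-embed-large` pulled, a toy DataFrame standing in for `vision_agent.tools.TOOLS_DF`, and `top_k` keeping the existing `Sim` behavior of returning one dict per matching row:

```python
# Sketch: embedding-based tool lookup with the new OllamaSim.
# The DataFrame below is a toy stand-in for vision_agent.tools.TOOLS_DF,
# which OllamaVisionAgentCoder passes in with sim_key="desc".
import pandas as pd

from vision_agent.utils.sim import OllamaSim

tools_df = pd.DataFrame(
    {
        "name": ["owl_v2", "ocr"],
        "desc": [
            "owl_v2 detects objects in an image given a text prompt",
            "ocr extracts text from an image",
        ],
    }
)

sim = OllamaSim(tools_df, sim_key="desc")  # embeds the 'desc' column once at init
for match in sim.top_k("find the dogs in this photo", k=1):
    print(match["name"], "->", match["desc"])
```

The same object is what `OllamaVisionAgentCoder` constructs as its default `tool_recommender`.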