From cfea046cd491e0c555e5127cbbbdc234e38aa9a9 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Sun, 25 Aug 2024 17:43:35 -0700 Subject: [PATCH] added ollama vision agent coder --- vision_agent/agent/__init__.py | 6 ++- vision_agent/agent/vision_agent_coder.py | 59 +++++++++++++++++++++--- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 79b6abae..2164d688 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,3 +1,7 @@ from .agent import Agent from .vision_agent import VisionAgent -from .vision_agent_coder import AzureVisionAgentCoder, VisionAgentCoder +from .vision_agent_coder import ( + AzureVisionAgentCoder, + OllamaVisionAgentCoder, + VisionAgentCoder, +) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 3a370c5e..48bf5b70 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -27,11 +27,11 @@ TEST_PLANS, USER_REQ, ) -from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM +from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OllamaLMM, OpenAILMM from vision_agent.utils import CodeInterpreterFactory, Execution from vision_agent.utils.execute import CodeInterpreter from vision_agent.utils.image_utils import b64_to_pil -from vision_agent.utils.sim import AzureSim, Sim +from vision_agent.utils.sim import AzureSim, OllamaSim, Sim from vision_agent.utils.video import play_video logging.basicConfig(stream=sys.stdout) @@ -572,8 +572,8 @@ class VisionAgentCoder(Agent): Example ------- - >>> from vision_agent.agent import VisionAgentCoder - >>> agent = VisionAgentCoder() + >>> import vision_agent as va + >>> agent = va.agent.VisionAgentCoder() >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") """ @@ -841,6 +841,53 @@ def log_progress(self, data: Dict[str, Any]) -> None: 
self.report_progress_callback(data) +class OllamaVisionAgentCoder(VisionAgentCoder): + """VisionAgentCoder that uses Ollama models for planning, coding, testing. + + Pre-requisites: + 1. Run ollama pull llava for the LMM (or any other LMM model that can consume images) + 2. Run ollama pull mxbai-embed-large for the embedding similarity model + + Example + ------- + >>> import vision_agent as va + >>> agent = va.agent.OllamaVisionAgentCoder() + >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") + """ + + def __init__( + self, + planner: Optional[LMM] = None, + coder: Optional[LMM] = None, + tester: Optional[LMM] = None, + debugger: Optional[LMM] = None, + tool_recommender: Optional[Sim] = None, + verbosity: int = 0, + report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> None: + super().__init__( + planner=( + OllamaLMM(temperature=0.0, json_mode=True) + if planner is None + else planner + ), + coder=OllamaLMM(temperature=0.0) if coder is None else coder, + tester=OllamaLMM(temperature=0.0) if tester is None else tester, + debugger=( + OllamaLMM(temperature=0.0, json_mode=True) + if debugger is None + else debugger + ), + tool_recommender=( + OllamaSim(T.TOOLS_DF, sim_key="desc") + if tool_recommender is None + else tool_recommender + ), + verbosity=verbosity, + report_progress_callback=report_progress_callback, + ) + + class AzureVisionAgentCoder(VisionAgentCoder): """VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing. @@ -850,8 +897,8 @@ class AzureVisionAgentCoder(VisionAgentCoder): Example ------- - >>> from vision_agent import AzureVisionAgentCoder - >>> agent = AzureVisionAgentCoder() + >>> import vision_agent as va + >>> agent = va.agent.AzureVisionAgentCoder() >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") """