diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 00000000..e205ad2b --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,15 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: +- family-names: "Laird" + given-names: "Dillon" +- family-names: "Jagadeesan" + given-names: "Shankar" +- family-names: "Cao" + given-names: "Yazhou" +- family-names: "Ng" + given-names: "Andrew" +title: "Vision Agent" +version: 0.2 +date-released: 2024-02-12 +url: "https://github.com/landing-ai/vision-agent" diff --git a/README.md b/README.md index 87b39d22..a5f24480 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ code to solve the task for them. Check out our discord for updates and roadmaps! ## Web Application -Try Vision Agent live on [va.landing.ai](https://va.landing.ai/) +Try Vision Agent live at [va.landing.ai](https://va.landing.ai/) (note this may not be running the most up-to-date version) ## Documentation @@ -40,16 +40,44 @@ using Azure OpenAI please see the Azure setup section): export OPENAI_API_KEY="your-api-key" ``` -### Important Note on API Usage -Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits) - ### Vision Agent +There are two agents that you can use. Vision Agent is a conversational agent that has +access to tools that allow it to write and navigate python code and file systems. It can +converse with the user in natural language. VisionAgentCoder is an agent that can write +code for vision tasks, such as counting people in an image. However, it cannot converse +and can only respond with code. VisionAgent can call VisionAgentCoder to write vision +code. + #### Basic Usage -You can interact with the agent as you would with any LLM or LMM model: +To chat with Vision Agent, run the Streamlit app locally with the following +commands: + +```bash +pip install -r examples/chat/requirements.txt +export WORKSPACE=/path/to/your/workspace +export ZMQ_PORT=5555 +streamlit run examples/chat/app.py +``` +You can find more details about the Streamlit app [here](examples/chat/). +#### Basic Programmatic Usage ```python >>> from vision_agent.agent import VisionAgent >>> agent = VisionAgent() +>>> resp = agent("Hello") +>>> print(resp) +[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello!
How can I assist you today?', 'let_user_respond': True}"}] +>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]}) +>>> resp = agent(resp) +``` + +### Vision Agent Coder +#### Basic Usage +You can interact with the agent as you would with any LLM or LMM model: + +```python +>>> from vision_agent.agent import VisionAgentCoder +>>> agent = VisionAgentCoder() >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") ``` @@ -90,7 +118,7 @@ To better understand how the model came up with it's answer, you can run it in d mode by passing in the verbose argument: ```python ->>> agent = VisionAgent(verbose=2) +>>> agent = VisionAgentCoder(verbose=2) ``` #### Detailed Usage @@ -180,9 +208,11 @@ def custom_tool(image_path: str) -> str: return np.zeros((10, 10)) ``` -You need to ensure you call `@va.tools.register_tool` with any imports it might use and -ensure the documentation is in the same format above with description, `Parameters:`, -`Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/). +You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global +variables will not be captured by `register_tool` so you need to include them in the +function. Make sure the documentation is in the same format above with description, +`Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case +[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool. ### Azure Setup If you want to use Azure OpenAI models, you need to have two OpenAI model deployments: @@ -209,7 +239,7 @@ You can then run Vision Agent using the Azure OpenAI models: ```python import vision_agent as va -agent = va.agent.AzureVisionAgent() +agent = va.agent.AzureVisionAgentCoder() ``` ****************************************************************************************************************************** @@ -218,7 +248,7 @@ agent = va.agent.AzureVisionAgent() #### How to get started with OpenAI API credits -1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key. +1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key. 2. Follow the instructions to purchase and manage your API credits. 3. Ensure your API key is correctly configured in your project settings. diff --git a/docs/api/agent.md b/docs/api/agent.md index ca663a6e..df51ea90 100644 --- a/docs/api/agent.md +++ b/docs/api/agent.md @@ -1,3 +1,7 @@ ::: vision_agent.agent.agent.Agent ::: vision_agent.agent.vision_agent.VisionAgent + +::: vision_agent.agent.vision_agent_coder.VisionAgentCoder + +::: vision_agent.agent.vision_agent_coder.AzureVisionAgentCoder diff --git a/docs/api/lmm.md b/docs/api/lmm.md index 496b6959..da7ea71d 100644 --- a/docs/api/lmm.md +++ b/docs/api/lmm.md @@ -1,3 +1,7 @@ ::: vision_agent.lmm.OpenAILMM ::: vision_agent.lmm.AzureOpenAILMM + +::: vision_agent.lmm.OllamaLMM + +::: vision_agent.lmm.ClaudeSonnetLMM diff --git a/docs/index.md b/docs/index.md index 68670be1..d0ed8178 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,7 @@ code to solve the task for them. Check out our discord for updates and roadmaps! 
## Web Application -Try Vision Agent live on [va.landing.ai](https://va.landing.ai/) +Try Vision Agent live at [va.landing.ai](https://va.landing.ai/) (note this may not be running the most up-to-date version) ## Documentation @@ -32,16 +32,44 @@ using Azure OpenAI please see the Azure setup section): export OPENAI_API_KEY="your-api-key" ``` -### Important Note on API Usage -Please be aware that using the API in this project requires you to have API credits (minimum of five US dollars). This is different from the OpenAI subscription used in this chatbot. If you don't have credit, further information can be found [here](https://github.com/landing-ai/vision-agent?tab=readme-ov-file#how-to-get-started-with-openai-api-credits) - ### Vision Agent +There are two agents that you can use. Vision Agent is a conversational agent that has +access to tools that allow it to write and navigate python code and file systems. It can +converse with the user in natural language. VisionAgentCoder is an agent that can write +code for vision tasks, such as counting people in an image. However, it cannot converse +and can only respond with code. VisionAgent can call VisionAgentCoder to write vision +code. + #### Basic Usage -You can interact with the agent as you would with any LLM or LMM model: +To chat with Vision Agent, run the Streamlit app locally with the following +commands: + +```bash +pip install -r examples/chat/requirements.txt +export WORKSPACE=/path/to/your/workspace +export ZMQ_PORT=5555 +streamlit run examples/chat/app.py +``` +You can find more details about the Streamlit app [here](examples/chat/). +#### Basic Programmatic Usage ```python >>> from vision_agent.agent import VisionAgent >>> agent = VisionAgent() +>>> resp = agent("Hello") +>>> print(resp) +[{"role": "user", "content": "Hello"}, {"role": "assistant", "content": "{'thoughts': 'The user has greeted me. I will respond with a greeting and ask how I can assist them.', 'response': 'Hello! How can I assist you today?', 'let_user_respond': True}"}] +>>> resp.append({"role": "user", "content": "Can you count the number of people in this image?", "media": ["people.jpg"]}) +>>> resp = agent(resp) +``` + +### Vision Agent Coder +#### Basic Usage +You can interact with the agent as you would with any LLM or LMM model: + +```python +>>> from vision_agent.agent import VisionAgentCoder +>>> agent = VisionAgentCoder() >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") ``` @@ -82,7 +110,7 @@ To better understand how the model came up with it's answer, you can run it in d mode by passing in the verbose argument: ```python ->>> agent = VisionAgent(verbose=2) +>>> agent = VisionAgentCoder(verbose=2) ``` #### Detailed Usage @@ -172,9 +200,11 @@ def custom_tool(image_path: str) -> str: return np.zeros((10, 10)) ``` -You need to ensure you call `@va.tools.register_tool` with any imports it might use and -ensure the documentation is in the same format above with description, `Parameters:`, -`Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/). +You need to ensure you call `@va.tools.register_tool` with any imports it uses. Global +variables will not be captured by `register_tool` so you need to include them in the +function. Make sure the documentation is in the same format above with description, +`Parameters:`, `Returns:`, and `Example\n-------`.
You can find an example use case +[here](examples/custom_tools/) as this is what the agent uses to pick and use the tool. ### Azure Setup If you want to use Azure OpenAI models, you need to have two OpenAI model deployments: @@ -201,7 +231,7 @@ You can then run Vision Agent using the Azure OpenAI models: ```python import vision_agent as va -agent = va.agent.AzureVisionAgent() +agent = va.agent.AzureVisionAgentCoder() ``` ****************************************************************************************************************************** @@ -210,7 +240,7 @@ agent = va.agent.AzureVisionAgent() #### How to get started with OpenAI API credits -1. Visit the[OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key. +1. Visit the [OpenAI API platform](https://beta.openai.com/signup/) to sign up for an API key. 2. Follow the instructions to purchase and manage your API credits. 3. Ensure your API key is correctly configured in your project settings. diff --git a/examples/chat/README.md b/examples/chat/README.md new file mode 100644 index 00000000..4016c72b --- /dev/null +++ b/examples/chat/README.md @@ -0,0 +1,51 @@ +# Vision Agent Chat Application + +The Vision Agent chat application allows you to have conversations with the agent system +to accomplish a wider variety of tasks. + +## Get Started +To get started, first install the requirements by running the following command: +```bash +pip install -r requirements.txt +``` + +There are two environment variables you must set. The first is `WORKSPACE`, which is +where the agent will look for and write files: +```bash +export WORKSPACE=/path/to/your/workspace +``` + +The second is `ZMQ_PORT`, which is how the agent collects logs from the subprocesses it +runs for writing code: +```bash +export ZMQ_PORT=5555 +``` + +Finally, you can launch the app with the following command: +```bash +streamlit run app.py +``` + +You can upload an image to your workspace in the first tab of the right column, then ask +the agent to do a task (be sure to include which image you want it to use for testing), +for example: +``` +Can you count the number of people in this image? Use image.jpg for testing. +``` + +## Layout +There are two columns, left and right, each with two tabs. + +`Chat`, the first tab of the left column, is where you can chat with Vision Agent. It can +answer your questions and execute python code on your behalf. Note that if you ask it to +generate vision code, it may take a while to run. + +`Code Execution Logs`, the second tab of the left column, is where you will see +intermediate logs while Vision Agent is generating vision code. Because code generation +can take some time, you can monitor this tab to see what the agent is doing. + +`File Browser`, the first tab of the right column, is where you can see the files in your +workspace. + +`Code Editor`, the second tab of the right column, is where you can examine code files the +agent has written. You can also modify the code and save it if the code is incorrect.
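For reference, the `ZMQ_PORT` plumbing is a simple push/pull pipe: `app.py` binds a ZeroMQ PULL socket on that port and reads JSON messages in a loop, which it then renders in the `Code Execution Logs` tab. Below is a minimal sketch of what the sending side could look like; the PUSH socket and the message fields are illustrative assumptions inferred from the `recv_json` loop in `app.py`, not code taken from the agent itself.

```python
# Minimal sketch (assumed, for illustration): push a JSON log message to the
# ZMQ PULL socket that app.py binds on tcp://*:$ZMQ_PORT.
import os

import zmq


def send_update(payload: dict) -> None:
    context = zmq.Context()
    socket = context.socket(zmq.PUSH)  # counterpart to the PULL socket in app.py
    socket.connect(f"tcp://localhost:{os.environ.get('ZMQ_PORT', '5555')}")
    socket.send_json(payload)  # received by get_updates() in app.py
    socket.close()
    context.term()


if __name__ == "__main__":
    send_update({"log": "generating vision code", "status": "running"})
```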
diff --git a/examples/chat/app.py b/examples/chat/app.py new file mode 100644 index 00000000..f1cd62e7 --- /dev/null +++ b/examples/chat/app.py @@ -0,0 +1,173 @@ +import os +import threading +import time +from pathlib import Path + +import streamlit as st +import zmq +from code_editor import code_editor +from streamlit_autorefresh import st_autorefresh + +import vision_agent as va + +WORKSPACE = Path(os.environ.get("WORKSPACE", "")) +ZMQ_PORT = os.environ.get("ZMQ_PORT", None) +if ZMQ_PORT is None: + ZMQ_PORT = "5555" + os.environ["ZMQ_PORT"] = ZMQ_PORT + + +CACHE = Path(".cache.pkl") +SAVE = { + "name": "Save", + "feather": "Save", + "hasText": True, + "commands": ["save-state", ["response", "saved"]], + "response": "saved", + "style": {"bottom": "calc(50% - 4.25rem)", "right": "0.4rem"}, +} +agent = va.agent.VisionAgent(verbosity=1) + +st.set_page_config(layout="wide") + +if "file_path" not in st.session_state: + st.session_state.file_path = None + +if "messages" not in st.session_state: + st.session_state.messages = [] + +if "updates" not in st.session_state: + st.session_state.updates = [] + +if "input_text" not in st.session_state: + st.session_state.input_text = "" + + +def update_messages(messages, lock): + new_chat = agent.chat_with_code(messages) + with lock: + for new_message in new_chat: + if new_message not in messages: + messages.append(new_message) + + +def get_updates(updates, lock): + context = zmq.Context() + socket = context.socket(zmq.PULL) + socket.bind(f"tcp://*:{ZMQ_PORT}") + + while True: + message = socket.recv_json() + with lock: + updates.append(message) + time.sleep(0.1) + + +def submit(): + st.session_state.input_text = st.session_state.widget + st.session_state.widget = "" + + +update_lock = threading.Lock() +message_lock = threading.Lock() + +st_autorefresh(interval=1000, key="refresh") + + +def main(): + st.title("Vision Agent") + left_column, right_column = st.columns([2, 3]) + + with left_column: + st.title("Chat & Code Execution") + tabs = st.tabs(["Chat", "Code Execution Logs"]) + + with tabs[0]: + messages = st.container(height=400) + for message in st.session_state.messages: + if message["role"] in {"user", "assistant"}: + msg = message["content"] + msg = msg.replace("<execute_python>", "`") + msg = msg.replace("</execute_python>", "`") + messages.chat_message(message["role"]).write(msg) + else: + messages.chat_message("observation").text(message["content"]) + + st.text_input("Chat here", key="widget", on_change=submit) + prompt = st.session_state.input_text + + if prompt: + st.session_state.messages.append({"role": "user", "content": prompt}) + messages.chat_message("user").write(prompt) + message_thread = threading.Thread( + target=update_messages, + args=(st.session_state.messages, message_lock), + ) + message_thread.daemon = True + message_thread.start() + st.session_state.input_text = "" + + with tabs[1]: + updates = st.container(height=400) + for update in st.session_state.updates: + updates.chat_message("coder").write(update) + + with right_column: + st.title("File Browser & Code Editor") + tabs = st.tabs(["File Browser", "Code Editor"]) + + with tabs[0]: + uploaded_file = st.file_uploader("Upload a file") + if uploaded_file is not None: + with open(WORKSPACE / uploaded_file.name, "wb") as f: + f.write(uploaded_file.getbuffer()) + + for file in WORKSPACE.iterdir(): + if "__pycache__" not in str(file) and not str(file).startswith("."): + if st.button(file.name): + st.session_state.file_path = file + + if ( + "file_path" in st.session_state + and st.session_state.file_path is not
None + and st.session_state.file_path.suffix + in ( + ".jpg", + ".jpeg", + ".png", + ".bmp", + ".webp", + ) + ): + st.image( + str(WORKSPACE / st.session_state.file_path), use_column_width=True + ) + + with tabs[1]: + if ( + "file_path" not in st.session_state + or st.session_state.file_path is None + or st.session_state.file_path.suffix != ".py" + ): + st.write("Please select a python file from the file browser.") + else: + with open(WORKSPACE / st.session_state.file_path, "r") as f: + code = f.read() + + resp = code_editor(code, lang="python", buttons=[SAVE]) + if resp["type"] == "saved": + text = resp["text"] + with open(st.session_state.file_path, "w") as f: + f.write(text) + + if "update_thread_started" not in st.session_state: + update_thread = threading.Thread( + target=get_updates, args=(st.session_state.updates, update_lock) + ) + update_thread.daemon = True + update_thread.start() + st.session_state.update_thread_started = True + + +if __name__ == "__main__": + main() diff --git a/examples/chat/requirements.txt b/examples/chat/requirements.txt new file mode 100644 index 00000000..51fc7b8c --- /dev/null +++ b/examples/chat/requirements.txt @@ -0,0 +1,4 @@ +streamlit +streamlit_code_editor +streamlit-autorefresh +zmq diff --git a/mkdocs.yml b/mkdocs.yml index d53aa4b9..f6cb39aa 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -3,18 +3,59 @@ site_url: https://landing-ai.github.io/ repo_url: https://github.com/landing-ai/vision-agent edit_uri: edit/main/docs/ - theme: - name: "material" + name: material custom_dir: docs/_overrides + palette: + - media: "(prefers-color-scheme: light)" + scheme: default + primary: teal + accent: amber + toggle: + icon: material/lightbulb + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: teal + accent: amber + toggle: + icon: material/lightbulb-outline + name: Switch to light mode features: - - content.code.copy + - search.suggest + - search.highlight + - content.tabs.link + - navigation.indexes + - content.tooltips + - navigation.path - content.code.annotate - - content.action.edit + - content.code.copy + - content.code.select + - navigation.tabs + icon: + repo: fontawesome/brands/github-alt + plugins: - - mkdocstrings - - search + search: null + mkdocstrings: + handlers: + python: + options: + show_root_heading: true + show_if_no_docstring: true + inherited_members: true + members_order: source + separate_signature: true + unwrap_annotated: true + filters: + - '!^_' + merge_init_into_class: true + docstring_section_style: spacy + signature_crossrefs: true + show_symbol_type_heading: true + show_symbol_type_toc: true + markdown_extensions: # Syntax highlight @@ -33,8 +74,7 @@ markdown_extensions: nav: - Quick start: index.md - APIs: - - vision_agent.agent: api/agent.md - - vision_agent.tools: api/tools.md - - vision_agent.llm: api/llm.md - - vision_agent.lmm: api/lmm.md - - vision_agent.utils: api/utils.md + - vision_agent.agent: api/agent.md + - vision_agent.tools: api/tools.md + - vision_agent.lmm: api/lmm.md + - vision_agent.utils: api/utils.md diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index cbe0f381..79b6abae 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,2 +1,3 @@ from .agent import Agent -from .vision_agent import AzureVisionAgent, VisionAgent +from .vision_agent import VisionAgent +from .vision_agent_coder import AzureVisionAgentCoder, VisionAgentCoder diff --git a/vision_agent/agent/agent.py b/vision_agent/agent/agent.py index 
aaf02d1f..6b11f297 100644 --- a/vision_agent/agent/agent.py +++ b/vision_agent/agent/agent.py @@ -2,7 +2,7 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union -from vision_agent.lmm import Message +from vision_agent.lmm.types import Message class Agent(ABC): diff --git a/vision_agent/agent/agent_utils.py b/vision_agent/agent/agent_utils.py new file mode 100644 index 00000000..e4e678d7 --- /dev/null +++ b/vision_agent/agent/agent_utils.py @@ -0,0 +1,43 @@ +import json +import logging +import sys +from typing import Any, Dict + +logging.basicConfig(stream=sys.stdout) +_LOGGER = logging.getLogger(__name__) + + +def extract_json(json_str: str) -> Dict[str, Any]: + try: + json_dict = json.loads(json_str) + except json.JSONDecodeError: + input_json_str = json_str + if "```json" in json_str: + json_str = json_str[json_str.find("```json") + len("```json") :] + json_str = json_str[: json_str.find("```")] + elif "```" in json_str: + json_str = json_str[json_str.find("```") + len("```") :] + # get the last ``` not one from an intermediate string + json_str = json_str[: json_str.find("}```")] + try: + json_dict = json.loads(json_str) + except json.JSONDecodeError as e: + error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}" + _LOGGER.exception(error_msg) + raise ValueError(error_msg) from e + return json_dict # type: ignore + + +def extract_code(code: str) -> str: + if "\n```python" in code: + start = "\n```python" + elif "```python" in code: + start = "```python" + else: + return code + + code = code[code.find(start) + len(start) :] + code = code[: code.find("```")] + if code.startswith("python\n"): + code = code[len("python\n") :] + return code diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 21772b81..95560ecd 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,57 +1,39 @@ import copy -import difflib -import json import logging -import sys -import tempfile +import os from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Union, cast -from langsmith import traceable -from PIL import Image -from rich.console import Console -from rich.style import Style -from rich.syntax import Syntax -from tabulate import tabulate - -import vision_agent.tools as T from vision_agent.agent import Agent +from vision_agent.agent.agent_utils import extract_json from vision_agent.agent.vision_agent_prompts import ( - CODE, - FIX_BUG, - FULL_TASK, - PICK_PLAN, - PLAN, - PREVIOUS_FAILED, - SIMPLE_TEST, - TEST_PLANS, - USER_REQ, + EXAMPLES_CODE1, + EXAMPLES_CODE2, + VA_CODE, ) -from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM -from vision_agent.utils import CodeInterpreterFactory, Execution +from vision_agent.lmm import LMM, Message, OpenAILMM +from vision_agent.tools import META_TOOL_DOCSTRING +from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter -from vision_agent.utils.image_utils import b64_to_pil -from vision_agent.utils.sim import AzureSim, Sim -from vision_agent.utils.video import play_video -logging.basicConfig(stream=sys.stdout) +logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -_MAX_TABULATE_COL_WIDTH = 80 -_CONSOLE = Console() +WORKSPACE = Path(os.getenv("WORKSPACE", "")) +WORKSPACE.mkdir(parents=True, exist_ok=True) +if str(WORKSPACE) != "": + 
os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}" class DefaultImports: - """Container for default imports used in the code execution.""" - - common_imports = [ + code = [ "from typing import *", - "from pillow_heif import register_heif_opener", - "register_heif_opener()", + "from vision_agent.utils.execute import CodeInterpreter", + "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions", ] @staticmethod def to_code_string() -> str: - return "\n".join(DefaultImports.common_imports + T.__new_tools__) + return "\n".join(DefaultImports.code) @staticmethod def prepend_imports(code: str) -> str: @@ -61,686 +43,143 @@ def prepend_imports(code: str) -> str: return DefaultImports.to_code_string() + "\n\n" + code -def get_diff(before: str, after: str) -> str: - return "".join( - difflib.unified_diff( - before.splitlines(keepends=True), after.splitlines(keepends=True) - ) - ) - - -def format_memory(memory: List[Dict[str, str]]) -> str: - output_str = "" - for i, m in enumerate(memory): - output_str += f"### Feedback {i}:\n" - output_str += f"Code {i}:\n```python\n{m['code']}```\n\n" - output_str += f"Feedback {i}: {m['feedback']}\n\n" - if "edits" in m: - output_str += f"Edits {i}:\n{m['edits']}\n" - output_str += "\n" - - return output_str - - -def format_plans(plans: Dict[str, Any]) -> str: - plan_str = "" - for k, v in plans.items(): - plan_str += f"{k}:\n" - plan_str += "-" + "\n-".join([e["instructions"] for e in v]) - - return plan_str - - -def extract_code(code: str) -> str: - if "\n```python" in code: - start = "\n```python" - elif "```python" in code: - start = "```python" - else: - return code - - code = code[code.find(start) + len(start) :] - code = code[: code.find("```")] - if code.startswith("python\n"): - code = code[len("python\n") :] - return code - - -def extract_json(json_str: str) -> Dict[str, Any]: - try: - json_dict = json.loads(json_str) - except json.JSONDecodeError: - input_json_str = json_str - if "```json" in json_str: - json_str = json_str[json_str.find("```json") + len("```json") :] - json_str = json_str[: json_str.find("```")] - elif "```" in json_str: - json_str = json_str[json_str.find("```") + len("```") :] - # get the last ``` not one from an intermediate string - json_str = json_str[: json_str.find("}```")] - try: - json_dict = json.loads(json_str) - except json.JSONDecodeError as e: - error_msg = f"Could not extract JSON from the given str: {json_str}.\nFunction input:\n{input_json_str}" - _LOGGER.exception(error_msg) - raise ValueError(error_msg) from e - return json_dict # type: ignore - - -def extract_image( - media: Optional[Sequence[Union[str, Path]]] -) -> Optional[Sequence[Union[str, Path]]]: - if media is None: - return None - - new_media = [] - for m in media: - m = Path(m) - extension = m.suffix - if extension in [".jpg", ".jpeg", ".png", ".bmp"]: - new_media.append(m) - elif extension in [".mp4", ".mov"]: - frames = T.extract_frames(m) - with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: - if len(frames) > 0: - Image.fromarray(frames[0][0]).save(tmp.name) - new_media.append(Path(tmp.name)) - if len(new_media) == 0: - return None - return new_media - - -@traceable -def write_plans( - chat: List[Message], - tool_desc: str, - working_memory: str, - model: LMM, -) -> Dict[str, Any]: - chat = copy.deepcopy(chat) - if chat[-1]["role"] != "user": - raise ValueError("Last chat message must be from the user.") - - 
user_request = chat[-1]["content"] - context = USER_REQ.format(user_request=user_request) - prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory) - chat[-1]["content"] = prompt - return extract_json(model.chat(chat)) - - -@traceable -def pick_plan( - chat: List[Message], - plans: Dict[str, Any], - tool_infos: Dict[str, str], - model: LMM, - code_interpreter: CodeInterpreter, - test_multi_plan: bool, - log_progress: Callable[[Dict[str, Any]], None], - verbosity: int = 0, - max_retries: int = 3, -) -> Tuple[Any, str, str]: - if not test_multi_plan: - k = list(plans.keys())[0] - log_progress( - { - "type": "log", - "log_content": "Plans created", - "status": "completed", - "payload": plans[k], - } - ) - return plans[k], tool_infos[k], "" - - log_progress( - { - "type": "log", - "log_content": "Generating code to pick best plan", - "status": "started", - } - ) - all_tool_info = tool_infos["all"] +def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: chat = copy.deepcopy(chat) - if chat[-1]["role"] != "user": - raise ValueError("Last chat message must be from the user.") - - plan_str = format_plans(plans) - prompt = TEST_PLANS.format( - docstring=all_tool_info, plans=plan_str, previous_attempts="" - ) - code = extract_code(model(prompt)) - log_progress( - { - "type": "log", - "log_content": "Executing code to test plan", - "code": code, - "status": "running", - } - ) - tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code)) - tool_output_str = "" - if len(tool_output.logs.stdout) > 0: - tool_output_str = tool_output.logs.stdout[0] - - if verbosity == 2: - _print_code("Initial code and tests:", code) - _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}") - - log_progress( - { - "type": "log", - "log_content": ( - "Code execution succeed" - if tool_output.success - else "Code execution failed" - ), - "payload": tool_output.to_json(), - "status": "completed" if tool_output.success else "failed", - } - ) - # retry if the tool output is empty or code fails - count = 0 - while (not tool_output.success or tool_output_str == "") and count < max_retries: - prompt = TEST_PLANS.format( - docstring=all_tool_info, - plans=plan_str, - previous_attempts=PREVIOUS_FAILED.format( - code=code, error=tool_output.text() - ), - ) - log_progress( - { - "type": "log", - "log_content": "Retry running code", - "code": code, - "status": "running", - } - ) - code = extract_code(model(prompt)) - tool_output = code_interpreter.exec_isolation( - DefaultImports.prepend_imports(code) - ) - log_progress( - { - "type": "log", - "log_content": ( - "Code execution succeed" - if tool_output.success - else "Code execution failed" - ), - "code": code, - "payload": { - "result": tool_output.to_json(), - }, - "status": "completed" if tool_output.success else "failed", - } - ) - tool_output_str = "" - if len(tool_output.logs.stdout) > 0: - tool_output_str = tool_output.logs.stdout[0] - - if verbosity == 2: - _print_code("Code and test after attempted fix:", code) - _LOGGER.info(f"Code execution result after attempte {count}") - - count += 1 - - if verbosity >= 1: - _print_code("Final code:", code) - - user_req = chat[-1]["content"] - context = USER_REQ.format(user_request=user_req) - # because the tool picker model gets the image as well, we have to be careful with - # how much text we send it, so we truncate the tool output to 20,000 characters - prompt = PICK_PLAN.format( - context=context, - plans=format_plans(plans), - 
tool_output=tool_output_str[:20_000], - ) - chat[-1]["content"] = prompt - best_plan = extract_json(model(chat)) - if verbosity >= 1: - _LOGGER.info(f"Best plan:\n{best_plan}") - - plan = best_plan["best_plan"] - if plan in plans and plan in tool_infos: - best_plans = plans[plan] - best_tool_infos = tool_infos[plan] + conversation = "" + for chat_i in chat: + if chat_i["role"] == "user": + conversation += f"USER: {chat_i['content']}\n\n" + elif chat_i["role"] == "observation": + conversation += f"OBSERVATION:\n{chat_i['content']}\n\n" + elif chat_i["role"] == "assistant": + conversation += f"AGENT: {chat_i['content']}\n\n" + else: + raise ValueError(f"role {chat_i['role']} is not supported") + + prompt = VA_CODE.format( + documentation=META_TOOL_DOCSTRING, + examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}", + dir=WORKSPACE, + conversation=conversation, + ) + return extract_json(orch([{"role": "user", "content": prompt}])) + + +def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str: + # Note the code interpreter needs to keep running in the same environment because + # the SWE tools hold state like line numbers and currently open files. + result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code)) + + return_str = "" + if result.success: + for res in result.results: + if res.text is not None: + return_str += res.text.replace("\\n", "\n") + if result.logs.stdout: + return_str += "----- stdout -----\n" + for log in result.logs.stdout: + return_str += log.replace("\\n", "\n") else: - if verbosity >= 1: - _LOGGER.warning( - f"Best plan {plan} not found in plans or tool_infos. Using the first plan and tool info." + # for log in result.logs.stderr: + # return_str += log.replace("\\n", "\n") + if result.error: + return_str += ( + "\n" + result.error.value + "\n".join(result.error.traceback_raw) ) - k = list(plans.keys())[0] - best_plans = plans[k] - best_tool_infos = tool_infos[k] - - log_progress( - { - "type": "log", - "log_content": "Picked best plan", - "status": "complete", - "payload": best_plans, - } - ) - return best_plans, best_tool_infos, tool_output_str - - -@traceable -def write_code( - coder: LMM, - chat: List[Message], - plan: str, - tool_info: str, - tool_output: str, - feedback: str, -) -> str: - chat = copy.deepcopy(chat) - if chat[-1]["role"] != "user": - raise ValueError("Last chat message must be from the user.") - - user_request = chat[-1]["content"] - prompt = CODE.format( - docstring=tool_info, - question=FULL_TASK.format(user_request=user_request, subtasks=plan), - tool_output=tool_output, - feedback=feedback, - ) - chat[-1]["content"] = prompt - return extract_code(coder(chat)) - - -@traceable -def write_test( - tester: LMM, - chat: List[Message], - tool_utils: str, - code: str, - feedback: str, - media: Optional[Sequence[Union[str, Path]]] = None, -) -> str: - chat = copy.deepcopy(chat) - if chat[-1]["role"] != "user": - raise ValueError("Last chat message must be from the user.") - - user_request = chat[-1]["content"] - prompt = SIMPLE_TEST.format( - docstring=tool_utils, - question=user_request, - code=code, - feedback=feedback, - media=media, - ) - chat[-1]["content"] = prompt - return extract_code(tester(chat)) - - -def write_and_test_code( - chat: List[Message], - plan: str, - tool_info: str, - tool_output: str, - tool_utils: str, - working_memory: List[Dict[str, str]], - coder: LMM, - tester: LMM, - debugger: LMM, - code_interpreter: CodeInterpreter, - log_progress: Callable[[Dict[str, Any]], None], - verbosity: int = 0, - max_retries: 
int = 3, - media: Optional[Sequence[Union[str, Path]]] = None, -) -> Dict[str, Any]: - log_progress( - { - "type": "log", - "log_content": "Generating code", - "status": "started", - } - ) - code = write_code( - coder, - chat, - plan, - tool_info, - tool_output, - format_memory(working_memory), - ) - test = write_test( - tester, chat, tool_utils, code, format_memory(working_memory), media - ) - log_progress( - { - "type": "log", - "log_content": "Running code", - "status": "running", - "code": DefaultImports.prepend_imports(code), - "payload": { - "test": test, - }, - } - ) - result = code_interpreter.exec_isolation( - f"{DefaultImports.to_code_string()}\n{code}\n{test}" - ) - log_progress( - { - "type": "log", - "log_content": ( - "Code execution succeed" if result.success else "Code execution failed" - ), - "status": "completed" if result.success else "failed", - "code": DefaultImports.prepend_imports(code), - "payload": { - "test": test, - "result": result.to_json(), - }, - } - ) - if verbosity == 2: - _print_code("Initial code and tests:", code, test) - _LOGGER.info( - f"Initial code execution result:\n{result.text(include_logs=True)}" - ) + return return_str - count = 0 - new_working_memory: List[Dict[str, str]] = [] - while not result.success and count < max_retries: - if verbosity == 2: - _LOGGER.info(f"Start debugging attempt {count + 1}") - code, test, result = debug_code( - working_memory, - debugger, - code_interpreter, - code, - test, - result, - new_working_memory, - log_progress, - verbosity, - ) - count += 1 - - if verbosity >= 1: - _print_code("Final code and tests:", code, test) - - return { - "code": code, - "test": test, - "success": result.success, - "test_result": result, - "working_memory": new_working_memory, - } - - -@traceable -def debug_code( - working_memory: List[Dict[str, str]], - debugger: LMM, - code_interpreter: CodeInterpreter, - code: str, - test: str, - result: Execution, - new_working_memory: List[Dict[str, str]], - log_progress: Callable[[Dict[str, Any]], None], - verbosity: int = 0, -) -> tuple[str, str, Execution]: - log_progress( - { - "type": "code", - "status": "started", - } - ) - fixed_code_and_test = {"code": "", "test": "", "reflections": ""} - success = False - count = 0 - while not success and count < 3: - try: - fixed_code_and_test = extract_json( - debugger( - FIX_BUG.format( - code=code, - tests=test, - result="\n".join(result.text().splitlines()[-100:]), - feedback=format_memory(working_memory + new_working_memory), - ) - ) - ) - success = True - except Exception as e: - _LOGGER.exception(f"Error while extracting JSON: {e}") - - count += 1 - - old_code = code - old_test = test - - if fixed_code_and_test["code"].strip() != "": - code = extract_code(fixed_code_and_test["code"]) - if fixed_code_and_test["test"].strip() != "": - test = extract_code(fixed_code_and_test["test"]) - - new_working_memory.append( - { - "code": f"{code}\n{test}", - "feedback": fixed_code_and_test["reflections"], - "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"), - } - ) - log_progress( - { - "type": "code", - "status": "running", - "payload": { - "code": DefaultImports.prepend_imports(code), - "test": test, - }, - } - ) - - result = code_interpreter.exec_isolation( - f"{DefaultImports.to_code_string()}\n{code}\n{test}" - ) - log_progress( - { - "type": "code", - "status": "completed" if result.success else "failed", - "payload": { - "code": DefaultImports.prepend_imports(code), - "test": test, - "result": result.to_json(), - }, - } - ) - if verbosity 
== 2: - _print_code("Code and test after attempted fix:", code, test) - _LOGGER.info( - f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}" - ) - - return code, test, result - - -def _print_code(title: str, code: str, test: Optional[str] = None) -> None: - _CONSOLE.print(title, style=Style(bgcolor="dark_orange3", bold=True)) - _CONSOLE.print("=" * 30 + " Code " + "=" * 30) - _CONSOLE.print( - Syntax( - DefaultImports.prepend_imports(code), - "python", - theme="gruvbox-dark", - line_numbers=True, - ) - ) - if test: - _CONSOLE.print("=" * 30 + " Test " + "=" * 30) - _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True)) - - -def retrieve_tools( - plans: Dict[str, List[Dict[str, str]]], - tool_recommender: Sim, - verbosity: int = 0, -) -> Tuple[Dict[str, str], Dict[str, List[Dict[str, str]]]]: - tool_info = [] - tool_desc = [] - tool_lists: Dict[str, List[Dict[str, str]]] = {} - for k, plan in plans.items(): - tool_lists[k] = [] - for task in plan: - tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3) - tool_info.extend([e["doc"] for e in tools]) - tool_desc.extend([e["desc"] for e in tools]) - tool_lists[k].extend( - { - "plan": task["instructions"] if index == 0 else "", - "tool": e["desc"].strip().split()[0], - "documentation": e["doc"], - } - for index, e in enumerate(tools) - ) - - if verbosity == 2: - tool_desc_str = "\n".join(set(tool_desc)) - _LOGGER.info(f"Tools Description:\n{tool_desc_str}") - - tool_lists_unique = {} - for k in tool_lists: - tool_lists_unique[k] = "\n\n".join( - set(e["documentation"] for e in tool_lists[k]) - ) - all_tools = "\n\n".join(set(tool_info)) - tool_lists_unique["all"] = all_tools - return tool_lists_unique, tool_lists +def parse_execution(response: str) -> Optional[str]: + code = None + if "<execute_python>" in response: + code = response[response.find("<execute_python>") + len("<execute_python>") :] + code = code[: code.find("</execute_python>")] + return code class VisionAgent(Agent): - """Vision Agent is an agentic framework that can output code based on a user - request. It can plan tasks, retrieve relevant tools, write code, write tests and - reflect on failed test cases to debug code. It is inspired by AgentCoder - https://arxiv.org/abs/2312.13010 and Data Interpeter - https://arxiv.org/abs/2402.18679 + """Vision Agent is an agent that can chat with the user and call tools or other + agents to generate code for it. Vision Agent uses python code to execute actions for + the user. Vision Agent is inspired by OpenDevin + https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030 Example ------- - >>> from vision_agent import VisionAgent + >>> from vision_agent.agent import VisionAgent >>> agent = VisionAgent() - >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") + >>> resp = agent("Hello") + >>> resp.append({"role": "user", "content": "Can you write a function that counts dogs?", "media": ["dog.jpg"]}) + >>> resp = agent(resp) """ def __init__( self, - planner: Optional[LMM] = None, - coder: Optional[LMM] = None, - tester: Optional[LMM] = None, - debugger: Optional[LMM] = None, - tool_recommender: Optional[Sim] = None, + agent: Optional[LMM] = None, verbosity: int = 0, - report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, code_sandbox_runtime: Optional[str] = None, ) -> None: - """Initialize the Vision Agent. - - Parameters: - planner (Optional[LMM]): The planner model to use.
Defaults to OpenAILMM. - coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM. - tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM. - debugger (Optional[LMM]): The debugger model to - tool_recommender (Optional[Sim]): The tool recommender model to use. - verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the - highest verbosity level which will output all intermediate debugging - code. - report_progress_callback: a callback to report the progress of the agent. - This is useful for streaming logs in a web application where multiple - VisionAgent instances are running in parallel. This callback ensures - that the progress are not mixed up. - code_sandbox_runtime: the code sandbox runtime to use. A code sandbox is - used to run the generated code. It can be one of the following - values: None, "local" or "e2b". If None, Vision Agent will read the - value from the environment variable CODE_SANDBOX_RUNTIME. If it's - also None, the local python runtime environment will be used. - """ - - self.planner = ( - OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner - ) - self.coder = OpenAILMM(temperature=0.0) if coder is None else coder - self.tester = OpenAILMM(temperature=0.0) if tester is None else tester - self.debugger = ( - OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger - ) - - self.tool_recommender = ( - Sim(T.TOOLS_DF, sim_key="desc") - if tool_recommender is None - else tool_recommender + self.agent = ( + OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent ) + self.max_iterations = 100 self.verbosity = verbosity - self.max_retries = 2 - self.report_progress_callback = report_progress_callback self.code_sandbox_runtime = code_sandbox_runtime + if self.verbosity >= 1: + _LOGGER.setLevel(logging.INFO) def __call__( self, input: Union[str, List[Message]], media: Optional[Union[str, Path]] = None, ) -> str: - """Chat with Vision Agent and return intermediate information regarding the task. + """Chat with VisionAgent and get the conversation response. Parameters: - input (Union[List[Dict[str, str]], str]): A conversation in the format of - [{"role": "user", "content": "describe your task here..."}] or a string - of just the contents. + input (Union[str, List[Message]): A conversation in the format of + [{"role": "user", "content": "describe your task here..."}, ...] or a + string of just the contents. media (Optional[Union[str, Path]]): The media file to be used in the task. Returns: - str: The code output by the Vision Agent. + str: The conversation response. """ - if isinstance(input, str): input = [{"role": "user", "content": input}] if media is not None: input[0]["media"] = [media] - results = self.chat_with_workflow(input) - results.pop("working_memory") + results = self.chat_with_code(input) return results # type: ignore - @traceable - def chat_with_workflow( + def chat_with_code( self, chat: List[Message], - test_multi_plan: bool = True, - display_visualization: bool = False, - ) -> Dict[str, Any]: - """Chat with Vision Agent and return intermediate information regarding the task. + ) -> List[Message]: + """Chat with VisionAgent, it will use code to execute actions to accomplish + its tasks. 
Parameters: - chat (List[MediaChatItem]): A conversation + chat (List[Message]): A conversation in the format of: [{"role": "user", "content": "describe your task here..."}] or if it contains media files, it should be in the format of: [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}] - display_visualization (bool): If True, it opens a new window locally to - show the image(s) created by visualization code (if there is any). Returns: - Dict[str, Any]: A dictionary containing the code, test, test result, plan, - and working memory of the agent. + List[Message]: The conversation response. """ if not chat: - raise ValueError("Chat cannot be empty.") + raise ValueError("chat cannot be empty") - # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues with CodeInterpreterFactory.new_instance( code_sandbox_runtime=self.code_sandbox_runtime ) as code_interpreter: - chat = copy.deepcopy(chat) + orig_chat = copy.deepcopy(chat) + int_chat = copy.deepcopy(chat) media_list = [] - for chat_i in chat: + for chat_i in int_chat: if "media" in chat_i: for media in chat_i["media"]: media = code_interpreter.upload_file(media) @@ -759,180 +198,33 @@ def chat_with_workflow( if "media" in c else {"role": c["role"], "content": c["content"]} ) - for c in chat + for c in int_chat ], ) - code = "" - test = "" - working_memory: List[Dict[str, str]] = [] - results = {"code": "", "test": "", "plan": []} - plan = [] - - self.log_progress( - { - "type": "log", - "log_content": "Creating plans", - "status": "started", - } - ) - plans = write_plans( - int_chat, - T.TOOL_DESCRIPTIONS, - format_memory(working_memory), - self.planner, - ) - - if self.verbosity >= 1 and test_multi_plan: - for p in plans: - _LOGGER.info( - f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" - ) + finished = False + iterations = 0 + while not finished and iterations < self.max_iterations: + response = run_conversation(self.agent, int_chat) + if self.verbosity >= 1: + _LOGGER.info(response) + int_chat.append({"role": "assistant", "content": str(response)}) + orig_chat.append({"role": "assistant", "content": str(response)}) - tool_infos, tool_lists = retrieve_tools( - plans, - self.tool_recommender, - self.verbosity, - ) + if response["let_user_respond"]: + break - if test_multi_plan: - self.log_progress( - { - "type": "log", - "log_content": "Creating plans", - "status": "completed", - "payload": tool_lists, - } - ) - - best_plan, best_tool_info, tool_output_str = pick_plan( - int_chat, - plans, - tool_infos, - self.coder, - code_interpreter, - test_multi_plan, - self.log_progress, - verbosity=self.verbosity, - ) + code_action = parse_execution(response["response"]) - if self.verbosity >= 1: - _LOGGER.info( - f"Picked best plan:\n{tabulate(tabular_data=best_plan, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" - ) - - results = write_and_test_code( - chat=[{"role": c["role"], "content": c["content"]} for c in int_chat], - plan="\n-" + "\n-".join([e["instructions"] for e in best_plan]), - tool_info=best_tool_info, - tool_output=tool_output_str, - tool_utils=T.UTILITIES_DOCSTRING, - working_memory=working_memory, - coder=self.coder, - tester=self.tester, - debugger=self.debugger, - code_interpreter=code_interpreter, - log_progress=self.log_progress, - verbosity=self.verbosity, - media=media_list, - ) - success = cast(bool, results["success"]) - code = cast(str, 
results["code"]) - test = cast(str, results["test"]) - working_memory.extend(results["working_memory"]) # type: ignore - plan.append({"code": code, "test": test, "plan": best_plan}) - - execution_result = cast(Execution, results["test_result"]) - self.log_progress( - { - "type": "final_code", - "status": "completed" if success else "failed", - "payload": { - "code": DefaultImports.prepend_imports(code), - "test": test, - "result": execution_result.to_json(), - }, - } - ) + if code_action is not None: + obs = run_code_action(code_action, code_interpreter) + if self.verbosity >= 1: + _LOGGER.info(obs) + int_chat.append({"role": "observation", "content": obs}) + orig_chat.append({"role": "observation", "content": obs}) - if display_visualization: - for res in execution_result.results: - if res.png: - b64_to_pil(res.png).show() - if res.mp4: - play_video(res.mp4) - - return { - "code": DefaultImports.prepend_imports(code), - "test": test, - "test_result": execution_result, - "plan": plan, - "working_memory": working_memory, - } + iterations += 1 + return orig_chat def log_progress(self, data: Dict[str, Any]) -> None: - if self.report_progress_callback is not None: - self.report_progress_callback(data) - - -class AzureVisionAgent(VisionAgent): - """Vision Agent that uses Azure OpenAI APIs for planning, coding, testing. - - Pre-requisites: - 1. Set the environment variable AZURE_OPENAI_API_KEY to your Azure OpenAI API key. - 2. Set the environment variable AZURE_OPENAI_ENDPOINT to your Azure OpenAI endpoint. - - Example - ------- - >>> from vision_agent import AzureVisionAgent - >>> agent = AzureVisionAgent() - >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") - """ - - def __init__( - self, - planner: Optional[LMM] = None, - coder: Optional[LMM] = None, - tester: Optional[LMM] = None, - debugger: Optional[LMM] = None, - tool_recommender: Optional[Sim] = None, - verbosity: int = 0, - report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, - ) -> None: - """Initialize the Vision Agent. - - Parameters: - planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM. - coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM. - tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM. - debugger (Optional[LMM]): The debugger model to - tool_recommender (Optional[Sim]): The tool recommender model to use. - verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the - highest verbosity level which will output all intermediate debugging - code. - report_progress_callback: a callback to report the progress of the agent. - This is useful for streaming logs in a web application where multiple - VisionAgent instances are running in parallel. This callback ensures - that the progress are not mixed up. 
- """ - super().__init__( - planner=( - AzureOpenAILMM(temperature=0.0, json_mode=True) - if planner is None - else planner - ), - coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder, - tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester, - debugger=( - AzureOpenAILMM(temperature=0.0, json_mode=True) - if debugger is None - else debugger - ), - tool_recommender=( - AzureSim(T.TOOLS_DF, sim_key="desc") - if tool_recommender is None - else tool_recommender - ), - verbosity=verbosity, - report_progress_callback=report_progress_callback, - ) + pass diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py new file mode 100644 index 00000000..ebf9ead4 --- /dev/null +++ b/vision_agent/agent/vision_agent_coder.py @@ -0,0 +1,897 @@ +import copy +import difflib +import logging +import os +import sys +import tempfile +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union, cast + +from PIL import Image +from rich.console import Console +from rich.style import Style +from rich.syntax import Syntax +from tabulate import tabulate + +import vision_agent.tools as T +from vision_agent.agent import Agent +from vision_agent.agent.agent_utils import extract_code, extract_json +from vision_agent.agent.vision_agent_coder_prompts import ( + CODE, + FIX_BUG, + FULL_TASK, + PICK_PLAN, + PLAN, + PREVIOUS_FAILED, + SIMPLE_TEST, + TEST_PLANS, + USER_REQ, +) +from vision_agent.lmm import LMM, AzureOpenAILMM, Message, OpenAILMM +from vision_agent.utils import CodeInterpreterFactory, Execution +from vision_agent.utils.execute import CodeInterpreter +from vision_agent.utils.image_utils import b64_to_pil +from vision_agent.utils.sim import AzureSim, Sim +from vision_agent.utils.video import play_video + +logging.basicConfig(stream=sys.stdout) +WORKSPACE = Path(os.getenv("WORKSPACE", "")) +_LOGGER = logging.getLogger(__name__) +_MAX_TABULATE_COL_WIDTH = 80 +_CONSOLE = Console() + + +class DefaultImports: + """Container for default imports used in the code execution.""" + + common_imports = [ + "from typing import *", + "from pillow_heif import register_heif_opener", + "register_heif_opener()", + ] + + @staticmethod + def to_code_string() -> str: + return "\n".join(DefaultImports.common_imports + T.__new_tools__) + + @staticmethod + def prepend_imports(code: str) -> str: + """Run this method to prepend the default imports to the code. + NOTE: be sure to run this method after the custom tools have been registered. 
+ """ + return DefaultImports.to_code_string() + "\n\n" + code + + +def get_diff(before: str, after: str) -> str: + return "".join( + difflib.unified_diff( + before.splitlines(keepends=True), after.splitlines(keepends=True) + ) + ) + + +def format_memory(memory: List[Dict[str, str]]) -> str: + output_str = "" + for i, m in enumerate(memory): + output_str += f"### Feedback {i}:\n" + output_str += f"Code {i}:\n```python\n{m['code']}```\n\n" + output_str += f"Feedback {i}: {m['feedback']}\n\n" + if "edits" in m: + output_str += f"Edits {i}:\n{m['edits']}\n" + output_str += "\n" + + return output_str + + +def format_plans(plans: Dict[str, Any]) -> str: + plan_str = "" + for k, v in plans.items(): + plan_str += f"{k}:\n" + plan_str += "-" + "\n-".join([e["instructions"] for e in v]) + + return plan_str + + +def extract_image( + media: Optional[Sequence[Union[str, Path]]] +) -> Optional[Sequence[Union[str, Path]]]: + if media is None: + return None + + new_media = [] + for m in media: + m = Path(m) + extension = m.suffix + if extension in [".jpg", ".jpeg", ".png", ".bmp"]: + new_media.append(m) + elif extension in [".mp4", ".mov"]: + frames = T.extract_frames(m) + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp: + if len(frames) > 0: + Image.fromarray(frames[0][0]).save(tmp.name) + new_media.append(Path(tmp.name)) + if len(new_media) == 0: + return None + return new_media + + +def write_plans( + chat: List[Message], + tool_desc: str, + working_memory: str, + model: LMM, +) -> Dict[str, Any]: + chat = copy.deepcopy(chat) + if chat[-1]["role"] != "user": + raise ValueError("Last chat message must be from the user.") + + user_request = chat[-1]["content"] + context = USER_REQ.format(user_request=user_request) + prompt = PLAN.format(context=context, tool_desc=tool_desc, feedback=working_memory) + chat[-1]["content"] = prompt + return extract_json(model.chat(chat)) + + +def pick_plan( + chat: List[Message], + plans: Dict[str, Any], + tool_info: str, + model: LMM, + code_interpreter: CodeInterpreter, + media: List[str], + log_progress: Callable[[Dict[str, Any]], None], + verbosity: int = 0, + max_retries: int = 3, +) -> Tuple[str, str]: + log_progress( + { + "type": "log", + "log_content": "Generating code to pick the best plan", + "status": "started", + } + ) + + chat = copy.deepcopy(chat) + if chat[-1]["role"] != "user": + raise ValueError("Last chat message must be from the user.") + + plan_str = format_plans(plans) + prompt = TEST_PLANS.format( + docstring=tool_info, plans=plan_str, previous_attempts="", media=media + ) + + code = extract_code(model(prompt)) + log_progress( + { + "type": "log", + "log_content": "Executing code to test plans", + "code": DefaultImports.prepend_imports(code), + "status": "running", + } + ) + tool_output = code_interpreter.exec_isolation(DefaultImports.prepend_imports(code)) + tool_output_str = "" + if len(tool_output.logs.stdout) > 0: + tool_output_str = tool_output.logs.stdout[0] + + if verbosity == 2: + _print_code("Initial code and tests:", code) + _LOGGER.info(f"Initial code execution result:\n{tool_output.text()}") + + log_progress( + { + "type": "log", + "log_content": ( + "Code execution succeeded" + if tool_output.success + else "Code execution failed" + ), + "payload": tool_output.to_json(), + "status": "completed" if tool_output.success else "failed", + } + ) + + # retry if the tool output is empty or code fails + count = 0 + while (not tool_output.success or tool_output_str == "") and count < max_retries: + prompt = TEST_PLANS.format( 
+ docstring=tool_info, + plans=plan_str, + previous_attempts=PREVIOUS_FAILED.format( + code=code, error=tool_output.text() + ), + media=media, + ) + log_progress( + { + "type": "log", + "log_content": "Retrying code to test plans", + "status": "running", + "code": DefaultImports.prepend_imports(code), + } + ) + code = extract_code(model(prompt)) + log_progress( + { + "type": "log", + "log_content": ( + "Code execution succeeded" + if tool_output.success + else "Code execution failed" + ), + "code": DefaultImports.prepend_imports(code), + "payload": tool_output.to_json(), + "status": "completed" if tool_output.success else "failed", + } + ) + tool_output = code_interpreter.exec_isolation( + DefaultImports.prepend_imports(code) + ) + tool_output_str = "" + if len(tool_output.logs.stdout) > 0: + tool_output_str = tool_output.logs.stdout[0] + + if verbosity == 2: + _print_code("Code and test after attempted fix:", code) + _LOGGER.info(f"Code execution result after attempte {count}") + + count += 1 + + if verbosity >= 1: + _print_code("Final code:", code) + + user_req = chat[-1]["content"] + context = USER_REQ.format(user_request=user_req) + # because the tool picker model gets the image as well, we have to be careful with + # how much text we send it, so we truncate the tool output to 20,000 characters + prompt = PICK_PLAN.format( + context=context, + plans=format_plans(plans), + tool_output=tool_output_str[:20_000], + ) + chat[-1]["content"] = prompt + best_plan = extract_json(model(chat)) + + if verbosity >= 1: + _LOGGER.info(f"Best plan:\n{best_plan}") + log_progress( + { + "type": "log", + "log_content": "Picked best plan", + "status": "completed", + "payload": plans[best_plan["best_plan"]], + } + ) + return best_plan["best_plan"], tool_output_str + + +def write_code( + coder: LMM, + chat: List[Message], + plan: str, + tool_info: str, + tool_output: str, + feedback: str, +) -> str: + chat = copy.deepcopy(chat) + if chat[-1]["role"] != "user": + raise ValueError("Last chat message must be from the user.") + + user_request = chat[-1]["content"] + prompt = CODE.format( + docstring=tool_info, + question=FULL_TASK.format(user_request=user_request, subtasks=plan), + tool_output=tool_output, + feedback=feedback, + ) + chat[-1]["content"] = prompt + return extract_code(coder(chat)) + + +def write_test( + tester: LMM, + chat: List[Message], + tool_utils: str, + code: str, + feedback: str, + media: Optional[Sequence[Union[str, Path]]] = None, +) -> str: + chat = copy.deepcopy(chat) + if chat[-1]["role"] != "user": + raise ValueError("Last chat message must be from the user.") + + user_request = chat[-1]["content"] + prompt = SIMPLE_TEST.format( + docstring=tool_utils, + question=user_request, + code=code, + feedback=feedback, + media=media, + ) + chat[-1]["content"] = prompt + return extract_code(tester(chat)) + + +def write_and_test_code( + chat: List[Message], + plan: str, + tool_info: str, + tool_output: str, + tool_utils: str, + working_memory: List[Dict[str, str]], + coder: LMM, + tester: LMM, + debugger: LMM, + code_interpreter: CodeInterpreter, + log_progress: Callable[[Dict[str, Any]], None], + verbosity: int = 0, + max_retries: int = 3, + media: Optional[Sequence[Union[str, Path]]] = None, +) -> Dict[str, Any]: + log_progress( + { + "type": "log", + "log_content": "Generating code", + "status": "started", + } + ) + code = write_code( + coder, + chat, + plan, + tool_info, + tool_output, + format_memory(working_memory), + ) + test = write_test( + tester, chat, tool_utils, code, 
format_memory(working_memory), media + ) + + log_progress( + { + "type": "log", + "log_content": "Running code", + "status": "running", + "code": DefaultImports.prepend_imports(code), + "payload": { + "test": test, + }, + } + ) + result = code_interpreter.exec_isolation( + f"{DefaultImports.to_code_string()}\n{code}\n{test}" + ) + log_progress( + { + "type": "log", + "log_content": ( + "Code execution succeeded" + if result.success + else "Code execution failed" + ), + "status": "completed" if result.success else "failed", + "code": DefaultImports.prepend_imports(code), + "payload": { + "test": test, + "result": result.to_json(), + }, + } + ) + if verbosity == 2: + _print_code("Initial code and tests:", code, test) + _LOGGER.info( + f"Initial code execution result:\n{result.text(include_logs=True)}" + ) + + count = 0 + new_working_memory: List[Dict[str, str]] = [] + while not result.success and count < max_retries: + if verbosity == 2: + _LOGGER.info(f"Start debugging attempt {count + 1}") + code, test, result = debug_code( + working_memory, + debugger, + code_interpreter, + code, + test, + result, + new_working_memory, + log_progress, + verbosity, + ) + count += 1 + + if verbosity >= 1: + _print_code("Final code and tests:", code, test) + + return { + "code": code, + "test": test, + "success": result.success, + "test_result": result, + "working_memory": new_working_memory, + } + + +def debug_code( + working_memory: List[Dict[str, str]], + debugger: LMM, + code_interpreter: CodeInterpreter, + code: str, + test: str, + result: Execution, + new_working_memory: List[Dict[str, str]], + log_progress: Callable[[Dict[str, Any]], None], + verbosity: int = 0, +) -> tuple[str, str, Execution]: + log_progress( + { + "type": "code", + "status": "started", + } + ) + + fixed_code_and_test = {"code": "", "test": "", "reflections": ""} + success = False + count = 0 + while not success and count < 3: + try: + fixed_code_and_test = extract_json( + debugger( + FIX_BUG.format( + code=code, + tests=test, + result="\n".join(result.text().splitlines()[-50:]), + feedback=format_memory(working_memory + new_working_memory), + ) + ) + ) + success = True + except Exception as e: + _LOGGER.exception(f"Error while extracting JSON: {e}") + + count += 1 + + old_code = code + old_test = test + + if fixed_code_and_test["code"].strip() != "": + code = extract_code(fixed_code_and_test["code"]) + if fixed_code_and_test["test"].strip() != "": + test = extract_code(fixed_code_and_test["test"]) + + new_working_memory.append( + { + "code": f"{code}\n{test}", + "feedback": fixed_code_and_test["reflections"], + "edits": get_diff(f"{old_code}\n{old_test}", f"{code}\n{test}"), + } + ) + log_progress( + { + "type": "code", + "status": "running", + "payload": { + "code": DefaultImports.prepend_imports(code), + "test": test, + }, + } + ) + + result = code_interpreter.exec_isolation( + f"{DefaultImports.to_code_string()}\n{code}\n{test}" + ) + log_progress( + { + "type": "code", + "status": "completed" if result.success else "failed", + "payload": { + "code": DefaultImports.prepend_imports(code), + "test": test, + "result": result.to_json(), + }, + } + ) + if verbosity == 2: + _print_code("Code and test after attempted fix:", code, test) + _LOGGER.info( + f"Reflection: {fixed_code_and_test['reflections']}\nCode execution result after attempted fix: {result.text(include_logs=True)}" + ) + + return code, test, result + + +def _print_code(title: str, code: str, test: Optional[str] = None) -> None: + _CONSOLE.print(title, 
style=Style(bgcolor="dark_orange3", bold=True))
+    _CONSOLE.print("=" * 30 + " Code " + "=" * 30)
+    _CONSOLE.print(
+        Syntax(
+            DefaultImports.prepend_imports(code),
+            "python",
+            theme="gruvbox-dark",
+            line_numbers=True,
+        )
+    )
+    if test:
+        _CONSOLE.print("=" * 30 + " Test " + "=" * 30)
+        _CONSOLE.print(Syntax(test, "python", theme="gruvbox-dark", line_numbers=True))
+
+
+def retrieve_tools(
+    plans: Dict[str, List[Dict[str, str]]],
+    tool_recommender: Sim,
+    log_progress: Callable[[Dict[str, Any]], None],
+    verbosity: int = 0,
+) -> Dict[str, str]:
+    log_progress(
+        {
+            "type": "tools",
+            "status": "started",
+        }
+    )
+    tool_info = []
+    tool_desc = []
+    tool_lists: Dict[str, List[Dict[str, str]]] = {}
+    for k, plan in plans.items():
+        tool_lists[k] = []
+        for task in plan:
+            tools = tool_recommender.top_k(task["instructions"], k=2, thresh=0.3)
+            tool_info.extend([e["doc"] for e in tools])
+            tool_desc.extend([e["desc"] for e in tools])
+            tool_lists[k].extend(
+                {"description": e["desc"], "documentation": e["doc"]} for e in tools
+            )
+
+    if verbosity == 2:
+        tool_desc_str = "\n".join(set(tool_desc))
+        _LOGGER.info(f"Tools Description:\n{tool_desc_str}")
+
+    tool_lists_unique = {}
+    for k in tool_lists:
+        tool_lists_unique[k] = "\n\n".join(
+            set(e["documentation"] for e in tool_lists[k])
+        )
+    all_tools = "\n\n".join(set(tool_info))
+    tool_lists_unique["all"] = all_tools
+    return tool_lists_unique
+
+
+class VisionAgentCoder(Agent):
+    """Vision Agent Coder is an agentic framework that can output code based on a user
+    request. It can plan tasks, retrieve relevant tools, write code, write tests and
+    reflect on failed test cases to debug code. It is inspired by AgentCoder
+    https://arxiv.org/abs/2312.13010 and Data Interpreter https://arxiv.org/abs/2402.18679
+
+    Example
+    -------
+        >>> from vision_agent.agent import VisionAgentCoder
+        >>> agent = VisionAgentCoder()
+        >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg")
+    """
+
+    def __init__(
+        self,
+        planner: Optional[LMM] = None,
+        coder: Optional[LMM] = None,
+        tester: Optional[LMM] = None,
+        debugger: Optional[LMM] = None,
+        tool_recommender: Optional[Sim] = None,
+        verbosity: int = 0,
+        report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None,
+        code_sandbox_runtime: Optional[str] = None,
+    ) -> None:
+        """Initialize the Vision Agent Coder.
+
+        Parameters:
+            planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM.
+            coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM.
+            tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM.
+            debugger (Optional[LMM]): The debugger model to use. Defaults to OpenAILMM.
+            tool_recommender (Optional[Sim]): The tool recommender model to use.
+            verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the
+                highest verbosity level which will output all intermediate debugging
+                code.
+            report_progress_callback: A callback to report the progress of the agent.
+                This is useful for streaming logs in a web application where multiple
+                VisionAgentCoder instances are running in parallel. This callback
+                ensures that the progress logs are not mixed up.
+            code_sandbox_runtime: The code sandbox runtime to use. A code sandbox is
+                used to run the generated code. It can be one of the following
+                values: None, "local" or "e2b". If None, VisionAgentCoder will read
+                the value from the environment variable CODE_SANDBOX_RUNTIME. If that is
+                also None, the local python runtime environment will be used.
+ """ + + self.planner = ( + OpenAILMM(temperature=0.0, json_mode=True) if planner is None else planner + ) + self.coder = OpenAILMM(temperature=0.0) if coder is None else coder + self.tester = OpenAILMM(temperature=0.0) if tester is None else tester + self.debugger = ( + OpenAILMM(temperature=0.0, json_mode=True) if debugger is None else debugger + ) + self.verbosity = verbosity + if self.verbosity > 0: + _LOGGER.setLevel(logging.INFO) + + self.tool_recommender = ( + Sim(T.TOOLS_DF, sim_key="desc") + if tool_recommender is None + else tool_recommender + ) + self.report_progress_callback = report_progress_callback + self.code_sandbox_runtime = code_sandbox_runtime + + def __call__( + self, + input: Union[str, List[Message]], + media: Optional[Union[str, Path]] = None, + ) -> str: + """Chat with VisionAgentCoder and return intermediate information regarding the + task. + + Parameters: + input (Union[str, List[Message]]): A conversation in the format of + [{"role": "user", "content": "describe your task here..."}] or a string + of just the contents. + media (Optional[Union[str, Path]]): The media file to be used in the task. + + Returns: + str: The code output by the VisionAgentCoder. + """ + + if isinstance(input, str): + input = [{"role": "user", "content": input}] + if media is not None: + input[0]["media"] = [media] + results = self.chat_with_workflow(input) + results.pop("working_memory") + return results["code"] # type: ignore + + def chat_with_workflow( + self, + chat: List[Message], + test_multi_plan: bool = True, + display_visualization: bool = False, + ) -> Dict[str, Any]: + """Chat with VisionAgentCoder and return intermediate information regarding the + task. + + Parameters: + chat (List[Message]): A conversation + in the format of: + [{"role": "user", "content": "describe your task here..."}] + or if it contains media files, it should be in the format of: + [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}] + test_multi_plan (bool): If True, it will test tools for multiple plans and + pick the best one based off of the tool results. If False, it will go + with the first plan. + display_visualization (bool): If True, it opens a new window locally to + show the image(s) created by visualization code (if there is any). + + Returns: + Dict[str, Any]: A dictionary containing the code, test, test result, plan, + and working memory of the agent. 
+ """ + + if not chat: + raise ValueError("Chat cannot be empty.") + + # NOTE: each chat should have a dedicated code interpreter instance to avoid concurrency issues + with CodeInterpreterFactory.new_instance( + code_sandbox_runtime=self.code_sandbox_runtime + ) as code_interpreter: + chat = copy.deepcopy(chat) + media_list = [] + for chat_i in chat: + if "media" in chat_i: + for media in chat_i["media"]: + media = code_interpreter.upload_file(media) + chat_i["content"] += f" Media name {media}" # type: ignore + media_list.append(media) + + int_chat = cast( + List[Message], + [ + ( + { + "role": c["role"], + "content": c["content"], + "media": c["media"], + } + if "media" in c + else {"role": c["role"], "content": c["content"]} + ) + for c in chat + ], + ) + + code = "" + test = "" + working_memory: List[Dict[str, str]] = [] + results = {"code": "", "test": "", "plan": []} + plan = [] + success = False + self.log_progress( + { + "type": "log", + "log_content": "Creating plans", + "status": "started", + } + ) + plans = write_plans( + int_chat, + T.TOOL_DESCRIPTIONS, + format_memory(working_memory), + self.planner, + ) + + if self.verbosity >= 1: + for p in plans: + _LOGGER.info( + f"\n{tabulate(tabular_data=plans[p], headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" + ) + + tool_infos = retrieve_tools( + plans, + self.tool_recommender, + self.log_progress, + self.verbosity, + ) + + if test_multi_plan: + best_plan, tool_output_str = pick_plan( + int_chat, + plans, + tool_infos["all"], + self.coder, + code_interpreter, + media_list, + self.log_progress, + verbosity=self.verbosity, + ) + else: + best_plan = list(plans.keys())[0] + tool_output_str = "" + + if best_plan in plans and best_plan in tool_infos: + plan_i = plans[best_plan] + tool_info = tool_infos[best_plan] + else: + if self.verbosity >= 1: + _LOGGER.warning( + f"Best plan {best_plan} not found in plans or tool_infos. Using the first plan and tool info." 
+ ) + k = list(plans.keys())[0] + plan_i = plans[k] + tool_info = tool_infos[k] + + self.log_progress( + { + "type": "log", + "log_content": "Creating plans", + "status": "completed", + "payload": tool_info, + } + ) + + if self.verbosity >= 1: + _LOGGER.info( + f"Picked best plan:\n{tabulate(tabular_data=plan_i, headers='keys', tablefmt='mixed_grid', maxcolwidths=_MAX_TABULATE_COL_WIDTH)}" + ) + + results = write_and_test_code( + chat=[{"role": c["role"], "content": c["content"]} for c in int_chat], + plan="\n-" + "\n-".join([e["instructions"] for e in plan_i]), + tool_info=tool_info, + tool_output=tool_output_str, + tool_utils=T.UTILITIES_DOCSTRING, + working_memory=working_memory, + coder=self.coder, + tester=self.tester, + debugger=self.debugger, + code_interpreter=code_interpreter, + log_progress=self.log_progress, + verbosity=self.verbosity, + media=media_list, + ) + success = cast(bool, results["success"]) + code = cast(str, results["code"]) + test = cast(str, results["test"]) + working_memory.extend(results["working_memory"]) # type: ignore + plan.append({"code": code, "test": test, "plan": plan_i}) + + execution_result = cast(Execution, results["test_result"]) + self.log_progress( + { + "type": "final_code", + "status": "completed" if success else "failed", + "payload": { + "code": DefaultImports.prepend_imports(code), + "test": test, + "result": execution_result.to_json(), + }, + } + ) + + if display_visualization: + for res in execution_result.results: + if res.png: + b64_to_pil(res.png).show() + if res.mp4: + play_video(res.mp4) + + return { + "code": DefaultImports.prepend_imports(code), + "test": test, + "test_result": execution_result, + "plan": plan, + "working_memory": working_memory, + } + + def log_progress(self, data: Dict[str, Any]) -> None: + if self.report_progress_callback is not None: + self.report_progress_callback(data) + + +class AzureVisionAgentCoder(VisionAgentCoder): + """VisionAgentCoder that uses Azure OpenAI APIs for planning, coding, testing. + + Pre-requisites: + 1. Set the environment variable AZURE_OPENAI_API_KEY to your Azure OpenAI API key. + 2. Set the environment variable AZURE_OPENAI_ENDPOINT to your Azure OpenAI endpoint. + + Example + ------- + >>> from vision_agent import AzureVisionAgentCoder + >>> agent = AzureVisionAgentCoder() + >>> code = agent("What percentage of the area of the jar is filled with coffee beans?", media="jar.jpg") + """ + + def __init__( + self, + planner: Optional[LMM] = None, + coder: Optional[LMM] = None, + tester: Optional[LMM] = None, + debugger: Optional[LMM] = None, + tool_recommender: Optional[Sim] = None, + verbosity: int = 0, + report_progress_callback: Optional[Callable[[Dict[str, Any]], None]] = None, + ) -> None: + """Initialize the Vision Agent Coder. + + Parameters: + planner (Optional[LMM]): The planner model to use. Defaults to OpenAILMM. + coder (Optional[LMM]): The coder model to use. Defaults to OpenAILMM. + tester (Optional[LMM]): The tester model to use. Defaults to OpenAILMM. + debugger (Optional[LMM]): The debugger model to + tool_recommender (Optional[Sim]): The tool recommender model to use. + verbosity (int): The verbosity level of the agent. Defaults to 0. 2 is the + highest verbosity level which will output all intermediate debugging + code. + report_progress_callback: a callback to report the progress of the agent. + This is useful for streaming logs in a web application where multiple + VisionAgentCoder instances are running in parallel. 
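As a rough illustration of the callback contract described here (the handler below is an assumption, not part of the library, and is shown with the base class; the Azure variant accepts the same argument), progress updates can simply be collected for later display:

```python
from typing import Any, Dict, List

from vision_agent.agent import VisionAgentCoder

progress_log: List[Dict[str, Any]] = []

def collect_progress(update: Dict[str, Any]) -> None:
    # Every update carries a "type" and "status"; some also include
    # "log_content" and a "payload" with code or execution results.
    progress_log.append(update)

agent = VisionAgentCoder(report_progress_callback=collect_progress)
```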
This callback + ensures that the progress are not mixed up. + """ + super().__init__( + planner=( + AzureOpenAILMM(temperature=0.0, json_mode=True) + if planner is None + else planner + ), + coder=AzureOpenAILMM(temperature=0.0) if coder is None else coder, + tester=AzureOpenAILMM(temperature=0.0) if tester is None else tester, + debugger=( + AzureOpenAILMM(temperature=0.0, json_mode=True) + if debugger is None + else debugger + ), + tool_recommender=( + AzureSim(T.TOOLS_DF, sim_key="desc") + if tool_recommender is None + else tool_recommender + ), + verbosity=verbosity, + report_progress_callback=report_progress_callback, + ) diff --git a/vision_agent/agent/vision_agent_coder_prompts.py b/vision_agent/agent/vision_agent_coder_prompts.py new file mode 100644 index 00000000..cb4c3eeb --- /dev/null +++ b/vision_agent/agent/vision_agent_coder_prompts.py @@ -0,0 +1,328 @@ +USER_REQ = """ +## User Request +{user_request} +""" + +FULL_TASK = """ +## User Request +{user_request} + +## Subtasks +{subtasks} +""" + +FEEDBACK = """ +## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again. + +{feedback} +""" + + +PLAN = """ +**Context**: +{context} + +**Tools Available**: +{tool_desc} + +**Previous Feedback**: +{feedback} + +**Instructions**: +1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request. +2. Output three different plans each utilize a different strategy or tool. + +Output a list of jsons in the following format + +```json +{{ + "plan1": + [ + {{ + "instructions": str # what you should do in this task associated with a tool + }} + ], + "plan2": ..., + "plan3": ... +}} +``` +""" + + +TEST_PLANS = """ +**Role**: You are a software programmer responsible for testing different tools. + +**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan. + +**Documentation**: +This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. + +{docstring} + +**Plans**: +{plans} + +{previous_attempts} + +**Instructions**: +1. Write a program to load the media and call each tool and save it's output. +2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove numpy arrays from the printed dictionary. +3. Your test case MUST run only on the given images which are {media} +4. Print this final dictionary. + +**Example**: +plan1: +- Load the image from the provided file path 'image.jpg'. +- Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image. +plan2: +- Load the image from the provided file path 'image.jpg'. +- Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image. +- Count the number of detected objects labeled as 'person'. +plan3: +- Load the image from the provided file path 'image.jpg'. +- Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people. 
+
+```python
+from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting
+image = load_image("image.jpg")
+owl_v2_out = owl_v2("person", image)
+
+gsam_out = grounding_sam("person", image)
+gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out]
+
+loca_out = loca_zero_shot_counting(image)
+loca_out = loca_out["count"]
+
+final_out = {{"owl_v2": owl_v2_out, "grounding_sam": gsam_out, "loca_zero_shot_counting": loca_out}}
+print(final_out)
+```
+"""
+
+
+PREVIOUS_FAILED = """
+**Previous Failed Attempts**:
+You previously ran this code:
+```python
+{code}
+```
+
+But got the following error or no stdout:
+{error}
+"""
+
+
+PICK_PLAN = """
+**Role**: You are a software programmer.
+
+**Task**: Your responsibility is to pick the best plan from the three plans provided.
+
+**Context**:
+{context}
+
+**Plans**:
+{plans}
+
+**Tool Output**:
+{tool_output}
+
+**Instructions**:
+1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request.
+2. Output a JSON object with the following format:
+{{
+    "thoughts": str # your thought process for choosing the best plan
+    "best_plan": str # the best plan you have chosen
+}}
+"""
+
+CODE = """
+**Role**: You are a software programmer.
+
+**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code; a test will be run after the code is submitted.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`.
+
+{docstring}
+
+**Input Code Snippet**:
+```python
+# Your code here
+```
+
+**User Instructions**:
+{question}
+
+**Tool Output**:
+{tool_output}
+
+**Previous Feedback**:
+{feedback}
+
+**Instructions**:
+1. **Understand and Clarify**: Make sure you understand the task.
+2. **Algorithm/Method Selection**: Decide on the most efficient way.
+3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
+4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments; remember that coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255).
+"""
+
+TEST = """
+**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass basic and edge case scenarios to ensure the code's robustness and reliability where possible.
+
+**Documentation**:
+This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions. Test only the code provided by the user.
+
+{docstring}
+
+**User Instructions**:
+{question}
+
+**Input Code Snippet**:
+```python
+### Please decide how you would want to generate test cases, based on the incomplete code or the completed version.
+{code}
+```
+
+**Instructions**:
+1. Verify the fundamental functionality under normal conditions.
+2.
Ensure each test case is well-documented with comments explaining the scenario it covers. +3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file. +4. DO NOT mock any functions, you must test their functionality as is. + +You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example: +```python +# You can run assertions to ensure the function is working as expected +assert function(input) == expected_output, "Test case description" + +# You can simply call the function to ensure it runs +function(input) + +# Or you can visualize the output +output = function(input) +visualize(output) +``` + +**Examples**: +## Prompt 1: +```python +def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]: + \""" Detects cats and dogs in an image. Returns a dictionary with + {{ + "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...] + }} + \""" +``` + +## Completion 1: +```python +# We can test to ensure the output has the correct structure but we cannot test the +# content of the output without knowing the image. We can test on "image.jpg" because +# it is provided by the user so we know it exists. +output = detect_cats_and_dogs("image.jpg") +assert "cats" in output, "The output should contain 'cats' +assert "dogs" in output, "The output should contain 'dogs' +``` + +## Prompt 2: +```python +def find_text(image_path: str, text: str) -> str: + \""" Finds the text in the image and returns the text. \""" + +## Completion 2: +```python +# Because we do not know ahead of time what text is in the image, we can only run the +# code and print the results. We can test on "image.jpg" because it is provided by the +# user so we know it exists. +found_text = find_text("image.jpg", "Hello World") +print(found_text) +``` +""" + +SIMPLE_TEST = """ +**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions. + +**Documentation**: +This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user. + +{docstring} + +**User Instructions**: +{question} + +**Input Code Snippet**: +```python +### Please decide how would you want to generate test cases. Based on incomplete code or completed version. +{code} +``` + +**Previous Feedback**: +{feedback} + +**Instructions**: +1. Verify the fundamental functionality under normal conditions. +2. Ensure each test case is well-documented with comments explaining the scenario it covers. +3. Your test case MUST run only on the given images which are {media} +4. Your test case MUST run only with the given values which is available in the question - {question} +5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. +6. DO NOT mock any functions, you must test their functionality as is. +7. DO NOT assert the output value, run the code and assert only the output format or data structure. +8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect. +9. DO NOT import the testing function as it will available in the testing environment. +10. Print the output of the function that is being tested. +11. 
Use the output of the function that is being tested as the return value of the testing function. +12. Run the testing function in the end and don't assign a variable to its output. +""" + + +FIX_BUG = """ +**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages. + +**Instructions**: +Please re-complete the code to fix the error message. Here is the previous version: +```python +{code} +``` + +When we run this test code: +```python +{tests} +``` + +It raises this error: +{result} + +This is previous feedback provided on the code: +{feedback} + +Please fix the bug by follow the error information and return a JSON object with the following format: +{{ + "reflections": str # any thoughts you have about the bug and how you fixed it + "code": str # the fixed code if any, else an empty string + "test": str # the fixed test code if any, else an empty string +}} +""" + + +REFLECT = """ +**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad. + +**Context**: +{context} + +**Plan**: +{plan} + +**Code**: +{code} + +**Instructions**: +1. **Understand the User Request**: Read the user request and understand what the user is asking for. +2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request. +3. **Review the Code**: Check the code to see if it solves the user request. +4. DO NOT add any reflections for test cases, these are taken care of. + +Respond in JSON format with the following structure: +{{ + "feedback": str # the feedback you would give to the coder and tester + "success": bool # whether the code and tests meet the user request +}} +""" diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 8f5e689b..6f0fdf74 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,327 +1,114 @@ -USER_REQ = """ -## User Request -{user_request} -""" - -FULL_TASK = """ -## User Request -{user_request} - -## Subtasks -{subtasks} -""" - -FEEDBACK = """ -## This contains code and feedback from previous runs and is used for providing context so you do not make the same mistake again. - -{feedback} -""" - - -PLAN = """ -**Context**: -{context} - -**Tools Available**: -{tool_desc} - -**Previous Feedback**: -{feedback} - -**Instructions**: -1. Based on the context and tools you have available, create a plan of subtasks to achieve the user request. -2. Output three different plans each utilize a different strategy or tool. - -Output a list of jsons in the following format - -```json -{{ - "plan1": - [ - {{ - "instructions": str # what you should do in this task associated with a tool - }} - ], - "plan2": ..., - "plan3": ... -}} -``` -""" - - -TEST_PLANS = """ -**Role**: You are a software programmer responsible for testing different tools. - -**Task**: Your responsibility is to take a set of several plans and test the different tools for each plan. - -**Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. 
- -{docstring} - -**Plans**: -{plans} - -{previous_attempts} - -**Instructions**: -1. Write a program to load the media and call each tool and save it's output. -2. Create a dictionary where the keys are the tool name and the values are the tool outputs. Remove any array types from the printed dictionary. -3. Print this final dictionary. - -**Example**: -plan1: -- Load the image from the provided file path 'image.jpg'. -- Use the 'owl_v2' tool with the prompt 'person' to detect and count the number of people in the image. -plan2: -- Load the image from the provided file path 'image.jpg'. -- Use the 'grounding_sam' tool with the prompt 'person' to detect and count the number of people in the image. -- Count the number of detected objects labeled as 'person'. -plan3: -- Load the image from the provided file path 'image.jpg'. -- Use the 'loca_zero_shot_counting' tool to count the dominant foreground object, which in this case is people. - -```python -from vision_agent.tools import load_image, owl_v2, grounding_sam, loca_zero_shot_counting -image = load_image("image.jpg") -owl_v2_out = owl_v2("person", image) - -gsam_out = grounding_sam("person", image) -gsam_out = [{{k: v for k, v in o.items() if k != "mask"}} for o in gsam_out] - -loca_out = loca_zero_shot_counting(image) -loca_out = loca_out["count"] - -final_out = {{"owl_v2": owl_v2_out, "florencev2_object_detection": florencev2_out, "loca_zero_shot_counting": loca_out}} -print(final_out) -``` -""" - - -PREVIOUS_FAILED = """ -**Previous Failed Attempts**: -You previously ran this code: -```python -{code} -``` - -But got the following error or no stdout: -{error} -""" - - -PICK_PLAN = """ -**Role**: You are a software programmer. - -**Task**: Your responsibility is to pick the best plan from the three plans provided. +VA_CODE = """ +**Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it. -**Context**: -{context} +**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . -**Plans**: -{plans} - -**Tool Output**: -{tool_output} - -**Instructions**: -1. Given the plans, image, and tool outputs, decide which plan is the best to achieve the user request. -2. Output a JSON object with the following format: -{{ - "thoughts": str # your thought process for choosing the best plan - "best_plan": str # the best plan you have chosen -}} -""" - -CODE = """ -**Role**: You are a software programmer. - -**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python language. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create. Do not call your code, a test will be run after the code is submitted. - -**Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. 
- -{docstring} - -**Input Code Snippet**: -```python -# Your code here -``` - -**User Instructions**: -{question} - -**Tool Output**: -{tool_output} - -**Previous Feedback**: -{feedback} - -**Instructions**: -1. **Understand and Clarify**: Make sure you understand the task. -2. **Algorithm/Method Selection**: Decide on the most efficient way. -3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode. -4. **Code Generation**: Translate your pseudocode into executable Python code. Ensure you use correct arguments, remember coordinates are always returned normalized from `vision_agent.tools`. All images from `vision_agent.tools` are in RGB format, red is (255, 0, 0) and blue is (0, 0, 255). -""" - -TEST = """ -**Role**: As a tester, your task is to create comprehensive test cases for the provided code. These test cases should encompass Basic and Edge case scenarios to ensure the code's robustness and reliability if possible. + +print("Hello World!") + **Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions. Test only the code provided by the user. - -{docstring} +This is the documentation for the different actions you can take: -**User Instructions**: -{question} - -**Input Code Snippet**: -```python -### Please decided how would you want to generate test cases. Based on incomplete code or completed version. -{code} -``` - -**Instructions**: -1. Verify the fundamental functionality under normal conditions. -2. Ensure each test case is well-documented with comments explaining the scenario it covers. -3. DO NOT use any files that are not provided by the user's instructions, your test must be run and will crash if it tries to load a non-existent file. -4. DO NOT mock any functions, you must test their functionality as is. - -You should format your test cases at the end of your response wrapped in ```python ``` tags like in the following example: -```python -# You can run assertions to ensure the function is working as expected -assert function(input) == expected_output, "Test case description" - -# You can simply call the function to ensure it runs -function(input) - -# Or you can visualize the output -output = function(input) -visualize(output) -``` +{documentation} **Examples**: -## Prompt 1: -```python -def detect_cats_and_dogs(image_path: str) -> Dict[str, List[List[float]]]: - \""" Detects cats and dogs in an image. Returns a dictionary with - {{ - "cats": [[x1, y1, x2, y2], ...], "dogs": [[x1, y1, x2, y2], ...] - }} - \""" -``` - -## Completion 1: -```python -# We can test to ensure the output has the correct structure but we cannot test the -# content of the output without knowing the image. We can test on "image.jpg" because -# it is provided by the user so we know it exists. -output = detect_cats_and_dogs("image.jpg") -assert "cats" in output, "The output should contain 'cats' -assert "dogs" in output, "The output should contain 'dogs' -``` - -## Prompt 2: -```python -def find_text(image_path: str, text: str) -> str: - \""" Finds the text in the image and returns the text. \""" - -## Completion 2: -```python -# Because we do not know ahead of time what text is in the image, we can only run the -# code and print the results. We can test on "image.jpg" because it is provided by the -# user so we know it exists. 
-found_text = find_text("image.jpg", "Hello World") -print(found_text) -``` -""" - -SIMPLE_TEST = """ -**Role**: As a tester, your task is to create a simple test case for the provided code. This test case should verify the fundamental functionality under normal conditions. - -**Documentation**: -This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task. They are available through importing `from vision_agent.tools import *`. You do not need to test these functions, only the code provided by the user. - -{docstring} - -**User Instructions**: -{question} - -**Input Code Snippet**: -```python -### Please decide how would you want to generate test cases. Based on incomplete code or completed version. -{code} -``` - -**Previous Feedback**: -{feedback} +Here is an example of how you can interact with a user and Actions to complete a task: +--- START EXAMPLES --- +[Current directory: /workspace/test] +{examples} +--- END EXAMPLES --- **Instructions**: -1. Verify the fundamental functionality under normal conditions. -2. Ensure each test case is well-documented with comments explaining the scenario it covers. -3. Your test case MUST run only on the given images which are {media} -4. Your test case MUST run only with the given values which is available in the question - {question} -5. DO NOT use any non-existent or dummy image or video files that are not provided by the user's instructions. -6. DO NOT mock any functions, you must test their functionality as is. -7. DO NOT assert the output value, run the code and assert only the output format or data structure. -8. DO NOT use try except block to handle the error, let the error be raised if the code is incorrect. -9. DO NOT import the testing function as it will available in the testing environment. -10. Print the output of the function that is being tested. -11. Use the output of the function that is being tested as the return value of the testing function. -12. Run the testing function in the end and don't assign a variable to its output. +1. **Understand and Clarify**: Make sure you understand the task, ask clarifying questions if the task is not clear. +2. **Output in JSON**: Respond in JSON format, {{"thoughts": , "response": , "let_user_respond": }}. + +**Conversation**: +Here is the current conversation so far: +--- START CONVERSATION --- +[Current directory: {dir}] +{conversation} """ +EXAMPLES_CODE1 = """ +USER: Can you detect the dogs in this image? Media name dog.jpg -FIX_BUG = """ -**Role** As a coder, your job is to find the error in the code and fix it. You are running in a notebook setting so you can run !pip install to install missing packages. +AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code('/workspace/test/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/workspace/test/dog.jpg'])", "let_user_respond": false} -**Instructions**: -Please re-complete the code to fix the error message. 
Here is the previous version: -```python -{code} -``` +OBSERVATION: +[File /workspace/test/dog_detector.py] +0|from vision_agent.tools import load_image, owl_v2 +1|def detect_dogs(image_path: str): +2| image = load_image(image_path) +3| dogs = owl_v2("dog", image) +4| return dogs +[End of file] -When we run this test code: -```python -{tests} -``` +AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))", "let_user_respond": false} -It raises this error: -{result} +OBSERVATION: +----- stdout ----- +[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}, {'score': 0.23, 'label': 'dog', 'box': [0.2, 0.3, 0.4, 0.5]}] -This is previous feedback provided on the code: -{feedback} -Please fix the bug by follow the error information and return a JSON object with the following format: -{{ - "reflections": str # any thoughts you have about the bug and how you fixed it - "code": str # the fixed code if any, else an empty string - "test": str # the fixed test code if any, else an empty string -}} -""" +AGENT: {"thoughts": "Two dogs are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect dogs and shown the output, do the results look good to you?", "let_user_respond": true} +USER: The the image only has one dog, can you fix this? -REFLECT = """ -**Role**: You are a reflection agent. Your job is to look at the original user request and the code produced and determine if the code satisfies the user's request. If it does not, you must provide feedback on how to improve the code. You are concerned only if the code meets the user request, not if the code is good or bad. +AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code('/workspace/test/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/workspace/test/dog.jpg'])", "let_user_respond": false} -**Context**: -{context} +OBSERVATION: +[File /workspace/test/dog_detector.py] +0|from vision_agent.tools import load_image, owl_v2 +1|def detect_dogs(image_path: str): +2| image = load_image(image_path) +3| dogs = owl_v2("dog", image, threshold=0.24) +4| return dogs +[End of file] -**Plan**: -{plan} +AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/workspace/test/dog.jpg'))", "let_user_respond": false} -**Code**: -{code} +OBSERVATION: +----- stdout ----- +[{'score': 0.99, 'label': 'dog', 'box': [0.1, 0.2, 0.3, 0.4]}] -**Instructions**: -1. **Understand the User Request**: Read the user request and understand what the user is asking for. -2. **Review the Plan**: Check the plan to see if it is a viable approach to solving the user request. -3. **Review the Code**: Check the code to see if it solves the user request. -4. DO NOT add any reflections for test cases, these are taken care of. 
+AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} +""" -Respond in JSON format with the following structure: -{{ - "feedback": str # the feedback you would give to the coder and tester - "success": bool # whether the code and tests meet the user request -}} +EXAMPLES_CODE2 = """ +USER: Can you create a function to count workers with helmets? + +AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true} + +USER: Yes you can use workers.png + +AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code('/workspace/test/code.py', 'Can you write code to count workers with helmets in this image?', media=['/workspace/test/workers.png'])", "let_user_respond": false} + +OBSERVATION: +[File /workspace/test/code.py] +0|from vision_agent.tools import load_image, owl_v2, closest_box_distance +1|def count_workers_with_helmets(image_path: str): +2| image = load_image(image_path) +3| workers = owl_v2("worker", image) +4| helmets = owl_v2("helmet", image) +5| count = 0 +6| for worker in workers: +7| person_box = worker['bbox'] +8| person_has_helmet = False +9| for helmet in helmets: +10| if closest_box_distance(worker['box'], helmet['box']) < 0.01: +11| person_has_helmet = True +12| break +13| if person_has_helmet: +14| count += 1 +15| return count +[End of file] + +AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/workspace/test/workers.png'))", "let_user_respond": false} + +OBSERVATION: +----- stdout ----- +2 + +AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true} """ diff --git a/vision_agent/lmm/__init__.py b/vision_agent/lmm/__init__.py index 41945da6..61ee1794 100644 --- a/vision_agent/lmm/__init__.py +++ b/vision_agent/lmm/__init__.py @@ -1 +1,2 @@ -from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, Message, OllamaLMM, OpenAILMM +from .lmm import LMM, AzureOpenAILMM, ClaudeSonnetLMM, OllamaLMM, OpenAILMM +from .types import Message diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index ddcb495b..8ed6b71a 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -16,6 +16,8 @@ import vision_agent.tools as T from vision_agent.tools.prompts import CHOOSE_PARAMS, SYSTEM_PROMPT +from .types import Message + _LOGGER = logging.getLogger(__name__) @@ -53,10 +55,6 @@ def encode_media(media: Union[str, Path]) -> str: return encode_image_bytes(image_bytes) -TextOrImage = Union[str, List[Union[str, Path]]] -Message = Dict[str, TextOrImage] - - class LMM(ABC): @abstractmethod def generate( @@ -136,7 +134,7 @@ def chat( { "type": "image_url", "image_url": { - "url": f"data:image/png;base64,{encoded_media}", # type: ignore + "url": f"data:image/png;base64,{encoded_media}", "detail": "low", }, }, diff --git 
a/vision_agent/lmm/types.py b/vision_agent/lmm/types.py new file mode 100644 index 00000000..ded6a42b --- /dev/null +++ b/vision_agent/lmm/types.py @@ -0,0 +1,5 @@ +from pathlib import Path +from typing import Dict, Sequence, Union + +TextOrImage = Union[str, Sequence[Union[str, Path]]] +Message = Dict[str, TextOrImage] diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index b354274f..e95a78c7 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,5 +1,6 @@ from typing import Callable, List, Optional +from .meta_tools import META_TOOL_DOCSTRING from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT from .tools import ( TOOL_DESCRIPTIONS, diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py new file mode 100644 index 00000000..c5860a60 --- /dev/null +++ b/vision_agent/tools/meta_tools.py @@ -0,0 +1,402 @@ +import os +import subprocess +from pathlib import Path +from typing import Any, Dict, List, Union + +import vision_agent as va +from vision_agent.lmm.types import Message +from vision_agent.tools.tool_utils import get_tool_documentation +from vision_agent.tools.tools import TOOL_DESCRIPTIONS + +# These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent + +CURRENT_FILE = None +CURRENT_LINE = 0 +DEFAULT_WINDOW_SIZE = 100 +ZMQ_PORT = os.environ.get("ZMQ_PORT", None) + + +def report_progress_callback(port: int, inp: Dict[str, Any]) -> None: + import zmq + + context = zmq.Context() + socket = context.socket(zmq.PUSH) + socket.connect(f"tcp://localhost:{port}") + socket.send_json(inp) + + +def filter_file(file_name: Union[str, Path]) -> bool: + file_name_p = Path(file_name) + return ( + file_name_p.is_file() + and "__pycache__" not in str(file_name_p) + and file_name_p.suffix in [".py", ".txt"] + and not file_name_p.name.startswith(".") + ) + + +def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str: + """Generates python code to solve vision based tasks. + + Parameters: + save_file (str): The file path to save the code. + chat (str): The chat message from the user. + media (List[str]): The media files to use. + + Returns: + str: The generated code. + + Examples + -------- + >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"]) + from vision_agent.tools import load_image, owl_v2 + def detect_dogs(image_path: str): + image = load_image(image_path) + dogs = owl_v2("dog", image) + return dogs + """ + + if ZMQ_PORT is not None: + agent = va.agent.VisionAgentCoder( + report_progress_callback=lambda inp: report_progress_callback( + int(ZMQ_PORT), inp + ) + ) + else: + agent = va.agent.VisionAgentCoder() + try: + fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] + response = agent.chat_with_workflow(fixed_chat) + code = response["code"] + with open(save_file, "w") as f: + f.write(code) + code_lines = code.splitlines(keepends=True) + total_lines = len(code_lines) + return view_lines(code_lines, 0, total_lines, save_file, total_lines) + except Exception as e: + return str(e) + + +def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str: + """Edits python code to solve a vision based task. + + Parameters: + code_file (str): The file path to the code. + chat_history (List[str]): The chat history to used to generate the code. + + Returns: + str: The edited code. 
+ + Examples + -------- + >>> edit_vision_code( + >>> "code.py", + >>> ["Can you detect the dogs in this image?", "Can you use a higher threshold?"], + >>> ["dog.jpg"], + >>> ) + from vision_agent.tools import load_image, owl_v2 + def detect_dogs(image_path: str): + image = load_image(image_path) + dogs = owl_v2("dog", image, threshold=0.8) + return dogs + """ + + agent = va.agent.VisionAgentCoder() + with open(code_file, "r") as f: + code = f.read() + + # Append latest code to second to last message from assistant + fixed_chat_history: List[Message] = [] + for i, chat in enumerate(chat_history): + if i == 0: + fixed_chat_history.append({"role": "user", "content": chat, "media": media}) + elif i > 0 and i < len(chat_history) - 1: + fixed_chat_history.append({"role": "user", "content": chat}) + elif i == len(chat_history) - 1: + fixed_chat_history.append({"role": "assistant", "content": code}) + fixed_chat_history.append({"role": "user", "content": chat}) + + try: + response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) + code = response["code"] + with open(code_file, "w") as f: + f.write(code) + code_lines = code.splitlines(keepends=True) + total_lines = len(code_lines) + return view_lines(code_lines, 0, total_lines, code_file, total_lines) + except Exception as e: + return str(e) + + +def format_lines(lines: List[str], start_idx: int) -> str: + output = "" + for i, line in enumerate(lines): + output += f"{i + start_idx}|{line}" + return output + + +def view_lines( + lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int +) -> str: + start = max(0, line_num - window_size) + end = min(len(lines), line_num + window_size) + return ( + f"[File: {file_path} ({total_lines} lines total)]\n" + + format_lines(lines[start:end], start) + + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]") + ) + + +def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str: + """Opens the file at at the given path in the editor. If `line_num` is provided, + the window will be moved to include that line. It only shows the first 100 lines by + default! Max `window_size` supported is 2000. use `scroll up/down` to view the file + if you want to see more. + + Parameters: + file_path (str): The file path to open, preferred absolute path. + line_num (int): The line number to move the window to. + window_size (int): The number of lines to show above and below the line. + """ + + file_path_p = Path(file_path) + if not file_path_p.exists(): + return f"[File {file_path} does not exist]" + + total_lines = sum(1 for _ in open(file_path_p)) + window_size = min(window_size, 2000) + window_size = window_size // 2 + if line_num - window_size < 0: + line_num = window_size + elif line_num >= total_lines: + line_num = total_lines - 1 - window_size + + global CURRENT_LINE, CURRENT_FILE + CURRENT_LINE = line_num + CURRENT_FILE = file_path + + with open(file_path, "r") as f: + lines = f.readlines() + + return view_lines(lines, line_num, window_size, file_path, total_lines) + + +def create_file(file_path: str) -> str: + """Creates and opens a new file with the given name. + + Parameters: + file_path (str): The file path to create, preferred absolute path. 
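To make the windowing behavior of these file-viewing meta-tools concrete, here is a small, hedged sketch; the file path and contents are made up for illustration:

```python
from pathlib import Path

from vision_agent.tools.meta_tools import open_file

# Write a small throwaway file, then view it through the meta-tool. open_file
# caps window_size at 2000 and shows roughly window_size lines centered on
# line_num, ending with a note about how many lines remain below the window.
tmp = Path("/tmp/example.py")  # hypothetical path
tmp.write_text("\n".join(f"print({i})" for i in range(300)) + "\n")
print(open_file(str(tmp), line_num=150, window_size=40))
```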
+ """ + + file_path_p = Path(file_path) + if file_path_p.exists(): + return f"[File {file_path} already exists]" + file_path_p.touch() + global CURRENT_FILE + CURRENT_FILE = file_path + return f"[File created {file_path}]" + + +def scroll_up() -> str: + """Moves the window up by 100 lines.""" + if CURRENT_FILE is None: + return "[No file is open]" + + return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE) + + +def scroll_down() -> str: + """Moves the window down by 100 lines.""" + if CURRENT_FILE is None: + return "[No file is open]" + + return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE) + + +def search_dir(search_term: str, dir_path: str) -> str: + """Searches for search_term in all files in a directory. + + Parameters: + search_term (str): The search term to look for. + dir_path (str): The directory path to search in, preferred absolute path. + """ + + dir_path_p = Path(dir_path) + if not dir_path_p.exists(): + return f"[Directory {dir_path} does not exist]" + + matches = [] + for file in dir_path_p.glob("**/*"): + if filter_file(file): + with open(file, "r") as f: + lines = f.readlines() + for i, line in enumerate(lines): + if search_term in line: + matches.append(f"{file}:{i}|{line.strip()}\n") + if not matches: + return f"[No matches found for {search_term} in {dir_path}]" + if len(matches) > 100: + return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]" + + return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n" + for match in matches: + return_str += match + + return_str += f"[End of matches for {search_term} in {dir_path}]" + return return_str + + +def search_file(search_term: str, file_path: str) -> str: + """Searches the file for the given search term. + + Parameters: + search_term (str): The search term to look for. + file_path (str): The file path to search in, preferred absolute path. + """ + + file_path_p = Path(file_path) + if not file_path_p.exists(): + return f"[File {file_path} does not exist]" + + with open(file_path_p, "r") as f: + lines = f.readlines() + + search_results = [] + for i, line in enumerate(lines): + if search_term in line: + search_results.append(f"{i}|{line.strip()}\n") + + if not search_results: + return f"[No matches found for {search_term} in {file_path}]" + + return_str = ( + f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n" + ) + for result in search_results: + return_str += result + + return_str += f"[End of matches for {search_term} in {file_path}]" + return return_str + + +def find_file(file_name: str, dir_path: str = "./") -> str: + """Finds all files with the given name in the specified directory. + + Parameters: + file_name (str): The file name to look for. + dir_path (str): The directory path to search in, preferred absolute path. + """ + + dir_path_p = Path(dir_path) + if not dir_path_p.exists(): + return f"[Directory {dir_path} does not exist]" + + files = list(dir_path_p.glob(f"**/*{file_name}*")) + files = [f for f in files if filter_file(f)] + if not files: + return f"[No files found in {dir_path} with name {file_name}]" + + return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n" + for match in files: + return_str += str(match) + "\n" + + return_str += f"[End of matches for {file_name} in {dir_path}]" + return return_str + + +def edit_file(file_path: str, start: int, end: int, content: str) -> str: + """Edits the file at the given path with the provided content. 
The content will be + inserted between the `start` and `end` line numbers. If the `start` and `end` are + the same, the content will be inserted at the `start` line number. If the `end` is + greater than the total number of lines in the file, the content will be inserted at + the end of the file. If the `start` or `end` are negative, the function will return + an error message. + + Parameters: + file_path (str): The file path to edit, preferred absolute path. + start (int): The line number to start the edit. + end (int): The line number to end the edit. + content (str): The content to insert. + """ + file_path_p = Path(file_path) + if not file_path_p.exists(): + return f"[File {file_path} does not exist]" + + total_lines = sum(1 for _ in open(file_path_p)) + if start < 0 or end < 0 or start > end or end > total_lines: + return "[Invalid line range]" + if start == end: + end += 1 + + new_content_lines = content.splitlines(keepends=True) + new_content_lines = [ + line if line.endswith("\n") else line + "\n" for line in new_content_lines + ] + with open(file_path_p, "r") as f: + lines = f.readlines() + edited_lines = lines[:start] + new_content_lines + lines[end:] + + cur_line = start + len(content.split("\n")) // 2 + tmp_file = file_path_p.with_suffix(".tmp") + with open(tmp_file, "w") as f: + f.writelines(edited_lines) + + process = subprocess.Popen( + [ + "flake8", + "--isolated", + "--select=F821,F822,F831,E111,E112,E113,E999,E902", + tmp_file, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + stdout, _ = process.communicate() + tmp_file.unlink() + if stdout != "": + stdout = stdout.replace(tmp_file.name, file_path) + error_msg = "[Edit failed with the following status]\n" + stdout + original_view = view_lines( + lines, + start + ((end - start) // 2), + DEFAULT_WINDOW_SIZE, + file_path, + total_lines, + ) + total_lines_edit = sum(1 for _ in edited_lines) + edited_view = view_lines( + edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit + ) + + error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" + return error_msg + + with open(file_path_p, "w") as f: + f.writelines(edited_lines) + + return open_file(file_path, cur_line) + + +def get_tool_descriptions() -> str: + """Returns a description of all the tools that `generate_vision_code` has access to. 
+    Helpful for answering questions about what types of vision tasks you can do with
+    `generate_vision_code`."""
+    return TOOL_DESCRIPTIONS
+
+
+META_TOOL_DOCSTRING = get_tool_documentation(
+    [
+        get_tool_descriptions,
+        generate_vision_code,
+        edit_vision_code,
+        open_file,
+        create_file,
+        scroll_up,
+        scroll_down,
+        edit_file,
+        search_dir,
+        search_file,
+        find_file,
+    ]
+)
diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
index 452d7f76..664466bc 100644
--- a/vision_agent/tools/tool_utils.py
+++ b/vision_agent/tools/tool_utils.py
@@ -1,7 +1,9 @@
+import inspect
 import logging
 import os
-from typing import Any, Dict, MutableMapping, Optional
+from typing import Any, Callable, Dict, List, MutableMapping, Optional
 
+import pandas as pd
 from IPython.display import display
 from pydantic import BaseModel
 from requests import Session
@@ -93,3 +95,47 @@ def _create_requests_session(
     session.mount(url, HTTPAdapter(max_retries=retries if num_retry > 0 else 0))
     session.headers.update(headers)
     return session
+
+
+def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str:
+    docstrings = ""
+    for func in funcs:
+        docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n"
+
+    return docstrings
+
+
+def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str:
+    descriptions = ""
+    for func in funcs:
+        description = func.__doc__
+        if description is None:
+            description = ""
+
+        if "Parameters:" in description:
+            description = (
+                description[: description.find("Parameters:")]
+                .replace("\n", " ")
+                .strip()
+            )
+
+        description = " ".join(description.split())
+        descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n"
+    return descriptions
+
+
+def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame:
+    data: Dict[str, List[str]] = {"desc": [], "doc": []}
+
+    for func in funcs:
+        desc = func.__doc__
+        if desc is None:
+            desc = ""
+        desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip()
+        desc = " ".join(desc.split())
+
+        doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}"
+        data["desc"].append(desc)
+        data["doc"].append(doc)
+
+    return pd.DataFrame(data)  # type: ignore
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index a888dcc2..70dac144 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -1,22 +1,25 @@
-import inspect
 import io
 import json
 import logging
 import tempfile
 from importlib import resources
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union, cast
 
 import cv2
 import numpy as np
-import pandas as pd
 import requests
 from moviepy.editor import ImageSequenceClip
 from PIL import Image, ImageDraw, ImageFont
 from pillow_heif import register_heif_opener  # type: ignore
 from pytube import YouTube  # type: ignore
 
-from vision_agent.tools.tool_utils import send_inference_request
+from vision_agent.tools.tool_utils import (
+    get_tool_descriptions,
+    get_tool_documentation,
+    get_tools_df,
+    send_inference_request,
+)
 from vision_agent.utils import extract_frames_from_video
 from vision_agent.utils.execute import FileSerializer, MimeType
 from vision_agent.utils.image_utils import (
@@ -54,7 +57,6 @@
 ]
 
 _API_KEY = "land_sk_WVYwP00xA3iXely2vuar6YUDZ3MJT9yLX6oW5noUkwICzYLiDV"
 _OCR_URL = "https://app.landing.ai/ocr/v1/detect-text"
-logging.basicConfig(level=logging.INFO)
 _LOGGER = logging.getLogger(__name__)
 
@@ -1220,50 +1222,6 @@
def overlay_heat_map( return np.array(combined) -def get_tool_documentation(funcs: List[Callable[..., Any]]) -> str: - docstrings = "" - for func in funcs: - docstrings += f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}\n\n" - - return docstrings - - -def get_tool_descriptions(funcs: List[Callable[..., Any]]) -> str: - descriptions = "" - for func in funcs: - description = func.__doc__ - if description is None: - description = "" - - if "Parameters:" in description: - description = ( - description[: description.find("Parameters:")] - .replace("\n", " ") - .strip() - ) - - description = " ".join(description.split()) - descriptions += f"- {func.__name__}{inspect.signature(func)}: {description}\n" - return descriptions - - -def get_tools_df(funcs: List[Callable[..., Any]]) -> pd.DataFrame: - data: Dict[str, List[str]] = {"desc": [], "doc": []} - - for func in funcs: - desc = func.__doc__ - if desc is None: - desc = "" - desc = desc[: desc.find("Parameters:")].replace("\n", " ").strip() - desc = " ".join(desc.split()) - - doc = f"{func.__name__}{inspect.signature(func)}:\n{func.__doc__}" - data["desc"].append(desc) - data["doc"].append(doc) - - return pd.DataFrame(data) # type: ignore - - TOOLS = [ owl_v2, grounding_sam, diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index aaea19cc..2ba8b69d 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -40,9 +40,7 @@ class MimeType(str, Enum): - """ - Represents a MIME type. - """ + """Represents a MIME type.""" TEXT_PLAIN = "text/plain" TEXT_HTML = "text/html" @@ -58,7 +56,9 @@ class MimeType(str, Enum): class FileSerializer: - """Adaptor class that allows IPython.display.display() to serialize a file to a base64 string representation.""" + """Adaptor class that allows IPython.display.display() to serialize a file to a + base64 string representation. + """ def __init__(self, file_uri: str): self.video_uri = file_uri @@ -76,13 +76,15 @@ def base64(self) -> str: class Result: - """ - Represents the data to be displayed as a result of executing a cell in a Jupyter notebook. - The result is similar to the structure returned by ipython kernel: https://ipython.readthedocs.io/en/stable/development/execution.html#execution-semantics + """Represents the data to be displayed as a result of executing a cell in a Jupyter + notebook. The result is similar to the structure returned by ipython kernel: + https://ipython.readthedocs.io/en/stable/development/execution.html#execution-semantics - The result can contain multiple types of data, such as text, images, plots, etc. Each type of data is represented - as a string, and the result can contain multiple types of data. The display calls don't have to have text representation, - for the actual result the representation is always present for the result, the other representations are always optional. + The result can contain multiple types of data, such as text, images, plots, etc. + Each type of data is represented as a string, and the result can contain multiple + types of data. The display calls don't have to have text representation, for the + actual result the representation is always present for the result, the other + representations are always optional. The class also provides methods to display the data in a Jupyter notebook. """ @@ -143,62 +145,43 @@ def __repr__(self) -> str: return str(self.raw) def _repr_html_(self) -> Optional[str]: - """ - Returns the HTML representation of the data. 
- """ + """Returns the HTML representation of the data.""" return self.html def _repr_markdown_(self) -> Optional[str]: - """ - Returns the Markdown representation of the data. - """ + """Returns the Markdown representation of the data.""" return self.markdown def _repr_svg_(self) -> Optional[str]: - """ - Returns the SVG representation of the data. - """ + """Returns the SVG representation of the data.""" return self.svg def _repr_png_(self) -> Optional[str]: - """ - Returns the base64 representation of the PNG data. - """ + """Returns the base64 representation of the PNG data.""" return self.png def _repr_jpeg_(self) -> Optional[str]: - """ - Returns the base64 representation of the JPEG data. - """ + """Returns the base64 representation of the JPEG data.""" return self.jpeg def _repr_pdf_(self) -> Optional[str]: - """ - Returns the PDF representation of the data. - """ + """Returns the PDF representation of the data.""" return self.pdf def _repr_latex_(self) -> Optional[str]: - """ - Returns the LaTeX representation of the data. - """ + """Returns the LaTeX representation of the data.""" return self.latex def _repr_json_(self) -> Optional[dict]: - """ - Returns the JSON representation of the data. - """ + """Returns the JSON representation of the data.""" return self.json def _repr_javascript_(self) -> Optional[str]: - """ - Returns the JavaScript representation of the data. - """ + """Returns the JavaScript representation of the data.""" return self.javascript def formats(self) -> Iterable[str]: - """ - Returns all available formats of the result. + """Returns all available formats of the result. :return: All available formats of the result in MIME types. """ @@ -239,8 +222,8 @@ def from_e2b_result(result: E2BResult) -> "Result": # type: ignore class Logs(BaseModel): - """ - Data printed to stdout and stderr during execution, usually by print statements, logs, warnings, subprocesses, etc. + """Data printed to stdout and stderr during execution, usually by print statements, + logs, warnings, subprocesses, etc. """ stdout: List[str] = [] @@ -257,9 +240,8 @@ def __str__(self) -> str: class Error(BaseModel): - """ - Represents an error that occurred during the execution of a cell. - The error contains the name of the error, the value of the error, and the traceback. + """Represents an error that occurred during the execution of a cell. The error + contains the name of the error, the value of the error, and the traceback. """ name: str @@ -290,9 +272,7 @@ def from_exception(e: Exception) -> "Error": class Execution(BaseModel): - """ - Represents the result of a cell execution. - """ + """Represents the result of a cell execution.""" class Config: arbitrary_types_allowed = True @@ -305,8 +285,8 @@ class Config: "Error object if an error occurred, None otherwise." def text(self, include_logs: bool = True) -> str: - """ - Returns the text representation of this object, i.e. including the main result or the error traceback, optionally along with the logs (stdout, stderr). + """Returns the text representation of this object, i.e. including the main + result or the error traceback, optionally along with the logs (stdout, stderr). """ prefix = str(self.logs) if include_logs else "" if self.error: @@ -330,9 +310,9 @@ def success(self) -> bool: return self.error is None def get_main_result(self) -> Optional[Result]: - """ - Get the main result of the execution. - An execution may have multiple results, e.g. intermediate outputs. The main result is the last output of the cell execution. 
+ """Get the main result of the execution. An execution may have multiple + results, e.g. intermediate outputs. The main result is the last output of the + cell execution. """ if not self.success: _LOGGER.info("Result is not available as the execution was not successful.") @@ -345,16 +325,13 @@ def get_main_result(self) -> Optional[Result]: return main_result def to_json(self) -> str: - """ - Returns the JSON representation of the Execution object. - """ + """Returns the JSON representation of the Execution object.""" return self.model_dump_json(exclude_none=True) @field_serializer("results", when_used="json") def serialize_results(results: List[Result]) -> List[Dict[str, Union[str, bool]]]: # type: ignore - """ - Serializes the results to JSON. - This method is used by the Pydantic JSON encoder. + """Serializes the results to JSON. This method is used by the Pydantic JSON + encoder. """ serialized = [] for result in results: @@ -367,9 +344,7 @@ def serialize_results(results: List[Result]) -> List[Dict[str, Union[str, bool]] @staticmethod def from_exception(exec: Exception, traceback_raw: List[str]) -> "Execution": - """ - Creates an Execution object from an exception. - """ + """Creates an Execution object from an exception.""" return Execution( error=Error( name=exec.__class__.__name__, @@ -382,9 +357,7 @@ def from_exception(exec: Exception, traceback_raw: List[str]) -> "Execution": @staticmethod def from_e2b_execution(exec: E2BExecution) -> "Execution": # type: ignore - """ - Creates an Execution object from an E2BResult object. - """ + """Creates an Execution object from an E2BResult object.""" return Execution( results=[Result.from_e2b_result(res) for res in exec.results], logs=Logs(stdout=exec.logs.stdout, stderr=exec.logs.stderr), @@ -545,6 +518,8 @@ def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None: nbformat version: {nbformat.__version__} Timeout: {self.timeout}""" ) + sleep(1) + self._new_kernel() def _new_kernel(self) -> None: if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore @@ -557,17 +532,19 @@ def close(self) -> None: run_sync(self.nb_client.km.shutdown_kernel)(now=True) run_sync(self.nb_client.km.cleanup_resources)() - channels = [ - self.nb_client.kc.stdin_channel, - self.nb_client.kc.hb_channel, - self.nb_client.kc.control_channel, - ] + if self.nb_client.kc is not None: + channels = [ + self.nb_client.kc.stdin_channel, + self.nb_client.kc.hb_channel, + self.nb_client.kc.control_channel, + ] - for ch in channels: - if ch.is_alive(): - ch.stop() + for ch in channels: + if ch.is_alive(): + ch.stop() + self.nb_client.kc.stop_channels() - self.nb_client.kc = None + self.nb_client.kc = None self.nb_client.km = None def restart_kernel(self) -> None: @@ -634,9 +611,8 @@ def new_instance(code_sandbox_runtime: Optional[str] = None) -> CodeInterpreter: def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution: - """ - Parse notebook cell outputs to Execution object. - Output types: https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs + """Parse notebook cell outputs to Execution object. 
Output types: + https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs """ execution = Execution() for data in outputs: diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index 5d638c62..217c9fa2 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -232,7 +232,7 @@ def overlay_heat_map( Parameters: image: the input image - masks: the heatmap to overlay + heat_map: the heatmap to overlay alpha: the transparency of the overlay Returns:
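Since this change moves `get_tool_documentation`, `get_tool_descriptions`, and `get_tools_df` out of `vision_agent/tools/tools.py` and into `vision_agent/tools/tool_utils.py`, a small usage sketch may help reviewers see what the relocated helpers produce. This is illustrative only and not part of the diff: `sample_tool` is a hypothetical function, and the import path assumes a checkout with this change installed.

```python
# Minimal sketch of the relocated helpers; `sample_tool` is hypothetical.
from vision_agent.tools.tool_utils import (
    get_tool_descriptions,
    get_tool_documentation,
    get_tools_df,
)


def sample_tool(image_path: str) -> str:
    """Describes the given image.

    Parameters:
        image_path (str): The image to describe, preferred absolute path.
    """
    return f"description of {image_path}"


# Full signature plus docstring, the format used for META_TOOL_DOCSTRING above.
print(get_tool_documentation([sample_tool]))
# One-line summary: only the text before "Parameters:" is kept.
print(get_tool_descriptions([sample_tool]))
# pandas DataFrame with "desc" and "doc" columns, one row per tool.
print(get_tools_df([sample_tool]))
```

Presumably the move lets both `tools.py` and the new meta-tools import the same helpers from one place instead of duplicating them.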