From 32263ffa7d38fbf93e2efae5ecdbd87e00a7cb72 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Tue, 30 Apr 2024 18:29:12 -0700 Subject: [PATCH] AgentCoder-like Example for Coding Solutions (#71) * fix conflict * use tester as debug * added comments * format fix * fix type errors' * fix trailing spaces * fix newline * fix flake8 * add ignores for windows * undo window ignores * do not run for windows * format fix * fix for windows * test no type check * fix type errors --- vision_agent/agent/__init__.py | 1 + vision_agent/agent/agent_coder.py | 170 +++++++++++++ vision_agent/agent/agent_coder_prompts.py | 135 ++++++++++ vision_agent/agent/execution.py | 287 ++++++++++++++++++++++ vision_agent/tools/tool_utils.py | 27 ++ vision_agent/tools/tools.py | 21 +- vision_agent/tools/tools_v2.py | 181 ++++++++++++++ 7 files changed, 802 insertions(+), 20 deletions(-) create mode 100644 vision_agent/agent/agent_coder.py create mode 100644 vision_agent/agent/agent_coder_prompts.py create mode 100644 vision_agent/agent/execution.py create mode 100644 vision_agent/tools/tool_utils.py create mode 100644 vision_agent/tools/tools_v2.py diff --git a/vision_agent/agent/__init__.py b/vision_agent/agent/__init__.py index 7d11231f..b90cbb16 100644 --- a/vision_agent/agent/__init__.py +++ b/vision_agent/agent/__init__.py @@ -1,4 +1,5 @@ from .agent import Agent +from .agent_coder import AgentCoder from .easytool import EasyTool from .reflexion import Reflexion from .vision_agent import VisionAgent diff --git a/vision_agent/agent/agent_coder.py b/vision_agent/agent/agent_coder.py new file mode 100644 index 00000000..89216f73 --- /dev/null +++ b/vision_agent/agent/agent_coder.py @@ -0,0 +1,170 @@ +import json +import logging +import os +import sys +from pathlib import Path +from typing import Dict, List, Optional, Union + +from vision_agent.agent import Agent +from vision_agent.llm import LLM, OpenAILLM +from vision_agent.lmm import LMM, OpenAILMM +from vision_agent.tools.tools_v2 import TOOLS_DOCSTRING, UTILITIES_DOCSTRING + +from .agent_coder_prompts import DEBUG, FIX_BUG, PROGRAM, TEST, VISUAL_TEST +from .execution import IMPORT_HELPER, check_correctness + +logging.basicConfig(stream=sys.stdout) +_LOGGER = logging.getLogger(__name__) + + +def write_tests(question: str, code: str, model: LLM) -> str: + prompt = TEST.format( + question=question, + code=code, + ) + completion = model(prompt) + return preprocess_data(completion) + + +def preprocess_data(code: str) -> str: + if "```python" in code: + code = code[code.find("```python") + len("```python") :] + code = code[: code.find("```")] + return code + + +def parse_file_name(s: str) -> str: + # We only output png files + return "".join([p for p in s.split(" ") if p.endswith(".png")]) + + +def write_program(question: str, feedback: str, model: LLM) -> str: + prompt = PROGRAM.format( + docstring=TOOLS_DOCSTRING, question=question, feedback=feedback + ) + completion = model(prompt) + return preprocess_data(completion) + + +def write_debug(question: str, code: str, feedback: str, model: LLM) -> str: + prompt = DEBUG.format( + docstring=UTILITIES_DOCSTRING, + code=code, + question=question, + feedback=feedback, + ) + completion = model(prompt) + return preprocess_data(completion) + + +def execute_tests(code: str, tests: str) -> Dict[str, Union[str, bool]]: + full_code = f"{IMPORT_HELPER}\n{code}\n{tests}" + return check_correctness(full_code, 20.0) + + +def run_visual_tests( + question: str, code: str, viz_file: str, feedback: str, model: LMM +) -> Dict[str, 
Union[str, bool]]:
    prompt = VISUAL_TEST.format(
        docstring=TOOLS_DOCSTRING,
        code=code,
        question=question,
        feedback=feedback,
    )
    completion = model(prompt, images=[viz_file])
    # type is from the prompt
    return json.loads(completion)  # type: ignore


def fix_bugs(code: str, tests: str, result: str, feedback: str, model: LLM) -> str:
    prompt = FIX_BUG.format(code=code, tests=tests, result=result, feedback=feedback)
    completion = model(prompt)
    return preprocess_data(completion)


class AgentCoder(Agent):
    """AgentCoder is based on the AgentCoder paper https://arxiv.org/abs/2312.13010
    and its open-source code https://github.com/huangd1999/AgentCoder, with some key
    differences. AgentCoder comprises three components: a coder agent, a tester
    agent, and an executor. The tester agent writes code to test the code written by
    the coder agent, but because we are solving a vision task, it is difficult to
    write testing code. Instead, we have the tester agent write code to visualize
    the output of the code written by the coder agent. If the code fails, we pass it
    back to the coder agent to fix the bug; if it succeeds, we pass it to a visual
    tester agent, an LMM such as GPT-4V, to visually inspect the output and make
    sure it looks good."""

    def __init__(
        self,
        coder_agent: Optional[LLM] = None,
        tester_agent: Optional[LLM] = None,
        visual_tester_agent: Optional[LMM] = None,
        verbose: bool = False,
    ) -> None:
        self.coder_agent = (
            OpenAILLM(temperature=0.1) if coder_agent is None else coder_agent
        )
        self.tester_agent = (
            OpenAILLM(temperature=0.1) if tester_agent is None else tester_agent
        )
        self.visual_tester_agent = (
            OpenAILMM(temperature=0.1, json_mode=True)
            if visual_tester_agent is None
            else visual_tester_agent
        )
        self.max_turns = 3
        if verbose:
            _LOGGER.setLevel(logging.INFO)

    def __call__(
        self,
        input: Union[List[Dict[str, str]], str],
        image: Optional[Union[str, Path]] = None,
    ) -> str:
        if isinstance(input, str):
            input = [{"role": "user", "content": input}]
        return self.chat(input, image)

    def chat(
        self,
        input: List[Dict[str, str]],
        image: Optional[Union[str, Path]] = None,
    ) -> str:
        question = input[0]["content"]
        if image:
            question += f" Input file path: {os.path.abspath(image)}"

        code = ""
        feedback = ""
        for _ in range(self.max_turns):
            code = write_program(question, feedback, self.coder_agent)
            _LOGGER.info(f"code:\n{code}")
            debug = write_debug(question, code, feedback, self.tester_agent)
            _LOGGER.info(f"debug:\n{debug}")
            results = execute_tests(code, debug)
            _LOGGER.info(
                f"execution results: passed: {results['passed']}\n{results['result']}"
            )

            if not results["passed"]:
                code = fix_bugs(
                    code, debug, results["result"].strip(), feedback, self.coder_agent  # type: ignore
                )
                _LOGGER.info(f"fixed code:\n{code}")
            else:
                # TODO: Sometimes it prints nothing, so we need to handle that case
                # TODO: The visual agent reflection does not work very well, needs more testing
                # viz_test_results = run_visual_tests(
                #     question, code, parse_file_name(results["result"].strip()), feedback, self.visual_tester_agent
                # )
                # _LOGGER.info(f"visual test results:\n{viz_test_results}")
                # if viz_test_results["finished"]:
                #     return f"{IMPORT_HELPER}\n{code}"
                # feedback += f"\n{viz_test_results['feedback']}"

                return f"{IMPORT_HELPER}\n{code}"

        return f"{IMPORT_HELPER}\n{code}"

    def log_progress(self, description: str) -> None:
        _LOGGER.info(description)
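# A minimal usage sketch of the class above. The question and image path are
# hypothetical, and an OpenAI API key is assumed to be configured for the
# default OpenAILLM/OpenAILMM agents:
#
#     from vision_agent.agent import AgentCoder
#
#     agent = AgentCoder(verbose=True)
#     code = agent(
#         "Count the number of cars in the image.",
#         image="cars.jpg",
#     )
#     print(code)  # the final program, prefixed with IMPORT_HELPER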
diff --git a/vision_agent/agent/agent_coder_prompts.py b/vision_agent/agent/agent_coder_prompts.py
new file mode 100644
index 00000000..81c365fb
--- /dev/null
+++ b/vision_agent/agent/agent_coder_prompts.py
@@ -0,0 +1,135 @@
PROGRAM = """
**Role**: You are a software programmer.

**Task**: As a programmer, you are required to complete the function. Use a Chain-of-Thought approach to break down the problem, create pseudocode, and then write the code in Python. Ensure that your code is efficient, readable, and well-commented. Return the requested information from the function you create.

**Documentation**:
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task; you do not need to define or import them, and you can assume they are available to you.
{docstring}

**Input Code Snippet**:
```python
def execute(image_path: str):
    # Your code here
```

**User Instructions**:
{question}

**Previous Feedback**:
{feedback}

**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task.
2. **Algorithm/Method Selection**: Decide on the most efficient approach.
3. **Pseudocode Creation**: Write down the steps you will follow in pseudocode.
4. **Code Generation**: Translate your pseudocode into executable Python code.
"""

DEBUG = """
**Role**: You are a software programmer.

**Task**: Your task is to run the `execute` function and either print its output or print the name of a file containing visualized output for another agent to examine. The other agent will then use your output, either the printed return value of the function or the visualized output as a file, to determine whether `execute` is functioning correctly.

**Documentation**:
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task; you do not need to define or import them, and you can assume they are available to you.
{docstring}

**Input Code Snippet**:
```python
### Decide how you want to generate test cases, based on either the incomplete code or the completed version.
{code}
```

**User Instructions**:
{question}

**Previous Feedback**:
{feedback}

**Instructions**:
1. **Understand and Clarify**: Make sure you understand the task.
2. **Code Execution**: Run the `execute` function with the given input from the user instructions.
3. **Output Generation**: Print the output or save it as a file for visualization, utilizing the functions you have access to.
"""

VISUAL_TEST = """
**Role**: You are a machine vision expert.

**Task**: Your task is to visually inspect the output of the `execute` function and determine whether the visualization of the function output looks correct given the user's instructions. If not, provide suggestions for improving the `execute` function.

**Documentation**:
This is the documentation for the functions you have access to. You may call any of these functions to help you complete the task; you do not need to define or import them, and you can assume they are available to you.
{docstring}


**Input Code Snippet**:
This is the code that produced the visualized output:
```python
{code}
```

**User Instructions**:
{question}

**Previous Feedback**:
{feedback}

**Instructions**:
1. **Visual Inspection**: Examine the visual output of the `execute` function.
2. **Evaluation**: Determine whether the visualization is correct based on the user's instructions.
3. **Feedback**: Provide feedback on the visualization and suggest improvements if necessary.
4. **Clear Concrete Instructions**: Provide clear, concrete instructions to improve the results. You may only make coding suggestions based on either the input code snippet or the documented functions provided. For example, do not say the threshold needs to be adjusted; instead, provide an exact value to adjust the threshold to.

Provide output in JSON format {{"finished": boolean, "feedback": "your feedback"}} where "finished" is true if the output is correct and false if not, and "feedback" is your feedback.
"""

FIX_BUG = """
Please re-complete the code to fix the error. Here is the previous version:
```python
{code}
```

When we run this code:
```python
{tests}
```

It raises this error:
```python
{result}
```

This is the previous feedback provided on the code:
{feedback}

Please fix the bug by following the error information, and return only Python code. You do not need to return the test cases. The corrected code should be in triple-backtick format (i.e., in ```python ```).
"""

TEST = """
**Role**: As a tester, your task is to create comprehensive test cases for the incomplete `execute` function. These test cases should encompass Basic, Edge, and Large Scale scenarios to ensure the code's robustness, reliability, and scalability.

**User Instructions**:
{question}

**Input Code Snippet**:
```python
### Decide how you want to generate test cases, based on either the incomplete code or the completed version.
{code}
```

**1. Basic Test Cases**:
- **Objective**: To verify the fundamental functionality of the `execute` function under normal conditions.

**2. Edge Test Cases**:
- **Objective**: To evaluate the function's behavior under extreme or unusual conditions.

**3. Large Scale Test Cases**:
- **Objective**: To assess the function's performance and scalability with large data samples.

**Instructions**:
- Implement a comprehensive set of test cases following the guidelines above.
- Ensure each test case is well-documented with comments explaining the scenario it covers.
- Pay special attention to edge cases as they often reveal hidden bugs.
- For large-scale tests, focus on the function's efficiency and performance under heavy loads.
"""
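# A sketch of how these templates are consumed in agent_coder.py; the result
# and feedback values here are hypothetical:
#
#     prompt = FIX_BUG.format(
#         code=code,
#         tests=tests,
#         result="AssertionError: expected 3 boxes, got 0",
#         feedback="",
#     )
#
# The LLM completion is then stripped of its ```python fences by
# preprocess_data before being executed.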
diff --git a/vision_agent/agent/execution.py b/vision_agent/agent/execution.py
new file mode 100644
index 00000000..5e1996b2
--- /dev/null
+++ b/vision_agent/agent/execution.py
@@ -0,0 +1,287 @@
"""This code is based on code from CodeGeeX https://github.com/THUDM/CodeGeeX"""

import contextlib
import faulthandler
import io
import multiprocessing
import os
import platform
import signal
import tempfile
import traceback
import typing
from pathlib import Path
from typing import Dict, Generator, List, Optional, Union

IMPORT_HELPER = """
import math
import re
import sys
import copy
import datetime
import itertools
import collections
import heapq
import statistics
import functools
import hashlib
import numpy
import numpy as np
import string
from typing import *
from collections import *
from vision_agent.tools.tools_v2 import *
"""


def unsafe_execute(code: str, timeout: float, result: List) -> None:
    with create_tempdir() as dir:
        code_path = Path(dir) / "code.py"
        with open(code_path, "w") as f:
            f.write(code)

        # These system calls are needed when cleaning up tempdir.
        import os
        import shutil

        rmtree = shutil.rmtree
        rmdir = os.rmdir
        chdir = os.chdir

        # Disable functionalities that can make destructive changes to the test.
        reliability_guard()

        try:
            with swallow_io() as s:
                with time_limit(timeout):
                    # WARNING
                    # This program exists to execute untrusted model-generated code. Although
                    # it is highly unlikely that model-generated code will do something overtly
                    # malicious in response to this test suite, model-generated code may act
                    # destructively due to a lack of model capability or alignment.
                    # Users are strongly encouraged to sandbox this evaluation suite so that it
                    # does not perform destructive actions on their host or network.
                    # The following lines execute the model-generated code; once you have read
                    # this disclaimer and taken appropriate precautions, proceed at your own risk:
                    code = compile(code, code_path, "exec")  # type: ignore
                    exec(code)
            result.append({"output": s.getvalue(), "passed": True})
        except TimeoutError:
            result.append({"output": "Timed out", "passed": False})
        except AssertionError:
            result.append({"output": f"{traceback.format_exc()}", "passed": False})
        except BaseException:
            result.append({"output": f"{traceback.format_exc()}", "passed": False})

        # Needed for cleaning up.
        shutil.rmtree = rmtree
        os.rmdir = rmdir
        os.chdir = chdir

        code_path.unlink()


def check_correctness(
    code: str,
    timeout: float = 3.0,
) -> Dict[str, Union[str, bool]]:
    """Evaluates the functional correctness of a completion by running the test suite
    provided in the problem.
    """

    manager = multiprocessing.Manager()
    result = manager.list()

    p = multiprocessing.Process(target=unsafe_execute, args=(code, timeout, result))
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        p.kill()

    if not result:
        result.append({"output": "Timed out", "passed": False})

    return {
        "code": code,
        "result": result[0]["output"],
        "passed": result[0]["passed"],
    }


# Copyright (c) OpenAI (https://openai.com)

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# ============================================================================


class redirect_stdin(contextlib._RedirectStream):
    _stream = "stdin"


@contextlib.contextmanager
def chdir(root: str) -> Generator[None, None, None]:
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    except BaseException as exc:
        raise exc
    finally:
        os.chdir(cwd)


class WriteOnlyStringIO(io.StringIO):
    """StringIO that throws an exception when it's read from."""

    def read(self, *args, **kwargs):  # type: ignore
        raise IOError

    def readline(self, *args, **kwargs):  # type: ignore
        raise IOError

    def readlines(self, *args, **kwargs):  # type: ignore
        raise IOError

    def readable(self, *args, **kwargs):  # type: ignore
        """Returns True if the IO object can be read."""
        return False


@contextlib.contextmanager
def create_tempdir() -> Generator[str, None, None]:
    with tempfile.TemporaryDirectory() as dirname:
        with chdir(dirname):
            yield dirname


@contextlib.contextmanager
def swallow_io() -> Generator[WriteOnlyStringIO, None, None]:
    stream = WriteOnlyStringIO()
    with contextlib.redirect_stdout(stream):
        with contextlib.redirect_stderr(stream):
            with redirect_stdin(stream):
                yield stream


@typing.no_type_check
@contextlib.contextmanager
def time_limit(seconds: float) -> Generator[None, None, None]:
    def signal_handler(signum, frame):
        raise TimeoutError("Timed out!")

    if platform.uname().system != "Windows":
        signal.setitimer(signal.ITIMER_REAL, seconds)
        signal.signal(signal.SIGALRM, signal_handler)
    try:
        yield
    finally:
        if platform.uname().system != "Windows":
            signal.setitimer(signal.ITIMER_REAL, 0)
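# A small illustration of check_correctness in use; the dict layout matches
# its return statement above, and the output values are illustrative:
#
#     result = check_correctness("print('hello')", timeout=3.0)
#     # {"code": "print('hello')", "result": "hello\n", "passed": True}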
@typing.no_type_check
def reliability_guard(maximum_memory_bytes: Optional[int] = None) -> None:
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    WARNING
    This function is NOT a security sandbox. Untrusted code, including
    model-generated code, should not be blindly executed outside of one. See
    the Codex paper for more information about OpenAI's code sandbox, and
    proceed with caution.
    """

    if maximum_memory_bytes is not None:
        if platform.uname().system != "Windows":
            import resource

            resource.setrlimit(
                resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
            )
            resource.setrlimit(
                resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
            )
            if not platform.uname().system == "Darwin":
                resource.setrlimit(
                    resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
                )

    faulthandler.disable()

    import builtins

    builtins.exit = None
    builtins.quit = None

    import os

    os.environ["OMP_NUM_THREADS"] = "1"

    os.kill = None
    os.system = None
    # os.putenv = None  # this causes numpy to fail on import
    os.remove = None
    os.removedirs = None
    os.rmdir = None
    os.fchdir = None
    os.setuid = None
    os.fork = None
    os.forkpty = None
    os.killpg = None
    os.rename = None
    os.renames = None
    os.truncate = None
    os.replace = None
    os.unlink = None
    os.fchmod = None
    os.fchown = None
    os.chmod = None
    os.chown = None
    os.chroot = None
    os.lchflags = None
    os.lchmod = None
    os.lchown = None
    os.getcwd = None
    os.chdir = None

    import shutil

    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess

    subprocess.Popen = None  # type: ignore

    __builtins__["help"] = None

    import sys

    sys.modules["ipdb"] = None
    sys.modules["joblib"] = None
    sys.modules["resource"] = None
    sys.modules["psutil"] = None
    sys.modules["tkinter"] = None
diff --git a/vision_agent/tools/tool_utils.py b/vision_agent/tools/tool_utils.py
new file mode 100644
index 00000000..3f66b539
--- /dev/null
+++ b/vision_agent/tools/tool_utils.py
@@ -0,0 +1,27 @@
import logging
from typing import Any, Dict

import requests

from vision_agent.type_defs import LandingaiAPIKey

_LOGGER = logging.getLogger(__name__)
_LND_API_KEY = LandingaiAPIKey().api_key
_LND_API_URL = "https://api.dev.landing.ai/v1/agent"


def _send_inference_request(
    payload: Dict[str, Any], endpoint_name: str
) -> Dict[str, Any]:
    res = requests.post(
        f"{_LND_API_URL}/model/{endpoint_name}",
        headers={
            "Content-Type": "application/json",
            "apikey": _LND_API_KEY,
        },
        json=payload,
    )
    if res.status_code != 200:
        _LOGGER.error(f"Request failed: {res.text}")
        raise ValueError(f"Request failed: {res.text}")
    return res.json()["data"]  # type: ignore
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index aec4f19b..743f9367 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -20,12 +20,10 @@
     rle_decode,
 )
 from vision_agent.lmm import OpenAILMM
+from vision_agent.tools.tool_utils import _send_inference_request
 from vision_agent.tools.video import extract_frames_from_video
-from vision_agent.type_defs import LandingaiAPIKey

 _LOGGER = logging.getLogger(__name__)
-_LND_API_KEY = LandingaiAPIKey().api_key
-_LND_API_URL = "https://api.dev.landing.ai/v1/agent"


 class Tool(ABC):
@@ -1221,20 +1219,3 @@ def register_tool(tool: Type[Tool]) -> Type[Tool]:
         "class": tool,
     }
     return tool
-
-
-def _send_inference_request(
-    payload: Dict[str, Any], endpoint_name: str
-) -> Dict[str, Any]:
-    res = requests.post(
-        f"{_LND_API_URL}/model/{endpoint_name}",
-        headers={
-            "Content-Type": "application/json",
-
"apikey": _LND_API_KEY, - }, - json=payload, - ) - if res.status_code != 200: - _LOGGER.error(f"Request failed: {res.text}") - raise ValueError(f"Request failed: {res.text}") - return res.json()["data"] # type: ignore diff --git a/vision_agent/tools/tools_v2.py b/vision_agent/tools/tools_v2.py new file mode 100644 index 00000000..3e39107d --- /dev/null +++ b/vision_agent/tools/tools_v2.py @@ -0,0 +1,181 @@ +import inspect +import tempfile +from importlib import resources +from typing import Any, Callable, Dict, List + +import numpy as np +from PIL import Image, ImageDraw, ImageFont + +from vision_agent.image_utils import convert_to_b64, normalize_bbox +from vision_agent.tools.tool_utils import _send_inference_request + +COLORS = [ + (158, 218, 229), + (219, 219, 141), + (23, 190, 207), + (188, 189, 34), + (199, 199, 199), + (247, 182, 210), + (127, 127, 127), + (227, 119, 194), + (196, 156, 148), + (197, 176, 213), + (140, 86, 75), + (148, 103, 189), + (255, 152, 150), + (152, 223, 138), + (214, 39, 40), + (44, 160, 44), + (255, 187, 120), + (174, 199, 232), + (255, 127, 14), + (31, 119, 180), +] + + +def grounding_dino( + prompt: str, + image: np.ndarray, + box_threshold: float = 0.20, + iou_threshold: float = 0.75, +) -> List[Dict[str, Any]]: + """'grounding_dino' is a tool that can detect arbitrary objects with inputs such as + category names or referring expressions. + + Parameters: + prompt (str): The prompt to ground to the image. + image (np.ndarray): The image to ground the prompt to. + box_threshold (float, optional): The threshold for the box detection. Defaults to 0.20. + iou_threshold (float, optional): The threshold for the Intersection over Union (IoU). Defaults to 0.75. + + Returns: + List[Dict[str, Any]]: A list of dictionaries containing the score, label, and + bounding box of the detected objects with normalized coordinates. + + Example + ------- + >>> grounding_dino("car. dinosaur", image) + [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}] + """ + image_size = image.shape[:2] + image_b64 = convert_to_b64(Image.fromarray(image)) + request_data = { + "prompt": prompt, + "image": image_b64, + "tool": "visual_grounding", + "kwargs": {"box_threshold": box_threshold, "iou_threshold": iou_threshold}, + } + data: Dict[str, Any] = _send_inference_request(request_data, "tools") + return_data = [] + for i in range(len(data["bboxes"])): + return_data.append( + { + "score": round(data["scores"][i], 2), + "label": data["labels"][i], + "bbox": normalize_bbox(data["bboxes"][i], image_size), + } + ) + return return_data + + +def load_image(image_path: str) -> np.ndarray: + """'load_image' is a utility function that loads an image from the given path. + + Parameters: + image_path (str): The path to the image. + + Returns: + np.ndarray: The image as a NumPy array. + + Example + ------- + >>> load_image("path/to/image.jpg") + """ + + image = Image.open(image_path).convert("RGB") + return np.array(image) + + +def save_image(image: np.ndarray) -> str: + """'save_image' is a utility function that saves an image as a temporary file. + + Parameters: + image (np.ndarray): The image to save. + + Returns: + str: The path to the saved image. 

    Example
    -------
    >>> save_image(image)
    "/tmp/tmpabc123.png"
    """

    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        pil_image = Image.fromarray(image.astype(np.uint8))
        pil_image.save(f, "PNG")
        return f.name


def display_bounding_boxes(
    image: np.ndarray, bboxes: List[Dict[str, Any]]
) -> np.ndarray:
    """'display_bounding_boxes' is a utility function that displays bounding boxes on an image.

    Parameters:
        image (np.ndarray): The image to display the bounding boxes on.
        bboxes (List[Dict[str, Any]]): A list of dictionaries containing the bounding boxes.

    Returns:
        np.ndarray: The image with the bounding boxes displayed.

    Example
    -------
    >>> image_with_bboxes = display_bounding_boxes(
    ...     image, [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}]
    ... )
    """
    pil_image = Image.fromarray(image.astype(np.uint8))

    color = {
        label: COLORS[i % len(COLORS)]
        for i, label in enumerate(set([box["label"] for box in bboxes]))
    }

    width, height = pil_image.size
    fontsize = max(12, int(min(width, height) / 40))
    draw = ImageDraw.Draw(pil_image)
    font = ImageFont.truetype(
        str(resources.files("vision_agent.fonts").joinpath("default_font_ch_en.ttf")),
        fontsize,
    )

    for elt in bboxes:
        label = elt["label"]
        box = elt["bbox"]
        score = elt["score"]

        # Denormalize the box coordinates to pixel values.
        box = [
            int(box[0] * width),
            int(box[1] * height),
            int(box[2] * width),
            int(box[3] * height),
        ]
        draw.rectangle(box, outline=color[label], width=4)
        text = f"{label}: {score:.2f}"
        text_box = draw.textbbox((box[0], box[1]), text=text, font=font)
        draw.rectangle((box[0], box[1], text_box[2], text_box[3]), fill=color[label])
        draw.text((box[0], box[1]), text, fill="black", font=font)
    return np.array(pil_image.convert("RGB"))


def get_tool_documentation(funcs: List[Callable]) -> str:
    docstrings = ""
    for func in funcs:
        docstrings += f"{func.__name__}: {inspect.signature(func)}\n{func.__doc__}\n\n"

    return docstrings


TOOLS_DOCSTRING = get_tool_documentation([load_image, grounding_dino])
UTILITIES_DOCSTRING = get_tool_documentation(
    [load_image, save_image, display_bounding_boxes]
)
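# A minimal end-to-end sketch of the tools above; "dinosaur.jpg" is a
# hypothetical input path and the detection output is illustrative:
#
#     image = load_image("dinosaur.jpg")
#     dets = grounding_dino("dinosaur", image, box_threshold=0.20)
#     # e.g. [{'score': 0.99, 'label': 'dinosaur', 'bbox': [0.1, 0.11, 0.35, 0.4]}]
#     viz = display_bounding_boxes(image, dets)
#     print(save_image(viz))  # e.g. /tmp/tmpabc123.png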