From 6b8e58bfb89b536991b63949974da4416b53dc6f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 15 Aug 2024 10:52:36 -0700 Subject: [PATCH 01/37] update for new conv --- vision_agent/agent/vision_agent.py | 25 ++++++++++++++++++---- vision_agent/agent/vision_agent_prompts.py | 2 ++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index cfb482e1..dad2d824 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,6 +1,8 @@ import copy import logging import os +import tempfile +import pickle as pkl from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast @@ -12,12 +14,14 @@ VA_CODE, ) from vision_agent.lmm import LMM, Message, OpenAILMM +from vision_agent.tools.meta_tools import Artifacts from vision_agent.tools import META_TOOL_DOCSTRING from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) +ARTIFACT = "artifacts.pkl" WORKSPACE = Path(os.getenv("WORKSPACE", "")) WORKSPACE.mkdir(parents=True, exist_ok=True) if str(WORKSPACE) != "": @@ -28,7 +32,8 @@ class DefaultImports: code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import generate_vision_code, edit_vision_code, open_file, create_file, scroll_up, scroll_down, edit_file, get_tool_descriptions", + "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions", + f"artifacts = Artifacts({ARTIFACT})", ] @staticmethod @@ -66,9 +71,21 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore -def run_code_action(code: str, code_interpreter: CodeInterpreter) -> str: - # Note the code 
interpreter needs to keep running in the same environment because - # the SWE tools hold state like line numbers and currently open files. +def run_code_action(code: str, artifacts: Artifacts, code_interpreter: CodeInterpreter) -> str: + with tempfile.TemporaryDirectory() as tmpdirname: + for name in artifacts: + temp_file_path = Path(tmpdirname) / name + ".py" + with open(temp_file_path, "w") as f: + f.write(artifacts[name]) + code_interpreter.upload_file(temp_file_path) + temp_file_path.unlink() + + temp_file_path = Path(tmpdirname) / ARTIFACT + with open(temp_file_path, "wb") as f: + pkl.dump(artifacts.artifacts, f) + code_interpreter.upload_file(temp_file_path) + temp_file_path.unlink() + result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code)) return_str = "" diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 4774d84d..7b714378 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -31,6 +31,7 @@ {conversation} """ + EXAMPLES_CODE1 = """ USER: Can you detect the dogs in this image? Media name dog.jpg @@ -76,6 +77,7 @@ AGENT: {"thoughts": "One dog is detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to detect one dog and shown you the output, do the results look good to you?", "let_user_respond": true} """ + EXAMPLES_CODE2 = """ USER: Can you create a function to count workers with helmets? 
From 07a9b849b6c5454e038a1bc9e40ab894bd18e432 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 15 Aug 2024 10:52:50 -0700 Subject: [PATCH 02/37] add artifact tools --- vision_agent/tools/meta_tools.py | 755 +++++++++++++++++++------------ 1 file changed, 466 insertions(+), 289 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 4a82436d..29d56c55 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -1,12 +1,18 @@ import os +import pickle as pkl import subprocess +import tempfile from pathlib import Path -from typing import Any, Dict, List, Union +from typing import Any, Dict, List, Optional, Union +from uuid import UUID import vision_agent as va +from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.lmm.types import Message from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS +from vision_agent.utils.image_utils import convert_to_b64 +from vision_agent.utils import CodeInterpreterFactory # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -35,97 +41,32 @@ def filter_file(file_name: Union[str, Path]) -> bool: ) -def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str: - """Generates python code to solve vision based tasks. +class Artifacts: + def __init__(self, save_path: Union[str, Path]) -> None: + self.save_path = save_path + self.artifacts = {} - Parameters: - save_file (str): The file path to save the code. - chat (str): The chat message from the user. - media (List[str]): The media files to use. - - Returns: - str: The generated code. 
- - Examples - -------- - >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"]) - from vision_agent.tools import load_image, owl_v2 - def detect_dogs(image_path: str): - image = load_image(image_path) - dogs = owl_v2("dog", image) - return dogs - """ + self.code_sandbox_runtime = None - if ZMQ_PORT is not None: - agent = va.agent.VisionAgentCoder( - report_progress_callback=lambda inp: report_progress_callback( - int(ZMQ_PORT), inp - ) - ) - else: - agent = va.agent.VisionAgentCoder() - try: - fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] - response = agent.chat_with_workflow(fixed_chat) - code = response["code"] - with open(save_file, "w") as f: - f.write(code) - code_lines = code.splitlines(keepends=True) - total_lines = len(code_lines) - return view_lines(code_lines, 0, total_lines, save_file, total_lines) - except Exception as e: - return str(e) - - -def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str: - """Edits python code to solve a vision based task. + def load(self, file_path: Union[str, Path]) -> None: + with open(file_path, "rb") as f: + self.artifacts = pkl.load(f) - Parameters: - code_file (str): The file path to the code. - chat_history (List[str]): The chat history to used to generate the code. - - Returns: - str: The edited code. 
- - Examples - -------- - >>> edit_vision_code( - >>> "code.py", - >>> ["Can you detect the dogs in this image?", "Can you use a higher threshold?"], - >>> ["dog.jpg"], - >>> ) - from vision_agent.tools import load_image, owl_v2 - def detect_dogs(image_path: str): - image = load_image(image_path) - dogs = owl_v2("dog", image, threshold=0.8) - return dogs - """ + def save(self) -> None: + with open(self.save_path, "wb") as f: + pkl.dump(self.artifacts, f) + + def __iter__(self): + return iter(self.artifacts) + + def __getitem__(self, name: str) -> str: + return self.artifacts[name] - agent = va.agent.VisionAgentCoder() - with open(code_file, "r") as f: - code = f.read() - - # Append latest code to second to last message from assistant - fixed_chat_history: List[Message] = [] - for i, chat in enumerate(chat_history): - if i == 0: - fixed_chat_history.append({"role": "user", "content": chat, "media": media}) - elif i > 0 and i < len(chat_history) - 1: - fixed_chat_history.append({"role": "user", "content": chat}) - elif i == len(chat_history) - 1: - fixed_chat_history.append({"role": "assistant", "content": code}) - fixed_chat_history.append({"role": "user", "content": chat}) - - try: - response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) - code = response["code"] - with open(code_file, "w") as f: - f.write(code) - code_lines = code.splitlines(keepends=True) - total_lines = len(code_lines) - return view_lines(code_lines, 0, total_lines, code_file, total_lines) - except Exception as e: - return str(e) + def __setitem__(self, name: str, value: str) -> None: + self.artifacts[name] = value + + def __contains__(self, name: str) -> bool: + return name in self.artifacts def format_lines(lines: List[str], start_idx: int) -> str: @@ -136,34 +77,38 @@ def format_lines(lines: List[str], start_idx: int) -> str: def view_lines( - lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int + lines: List[str], line_num: int, 
window_size: int, name: str, total_lines: int ) -> str: start = max(0, line_num - window_size) end = min(len(lines), line_num + window_size) return ( - f"[File: {file_path} ({total_lines} lines total)]\n" + f"[Artifact: {name} ({total_lines} lines total)]\n" + format_lines(lines[start:end], start) - + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]") + + ( + "[End of artifact]" + if end == len(lines) + else f"[{len(lines) - end} more lines]" + ) ) -def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str: - """Opens the file at at the given path in the editor. If `line_num` is provided, - the window will be moved to include that line. It only shows the first 100 lines by - default! Max `window_size` supported is 2000. use `scroll up/down` to view the file - if you want to see more. +def open_artifact( + artifacts: Artifacts, name: str, line_num: int = 0, window_size: int = 100 +) -> str: + """Opens the provided artifact. If `line_num` is provided, the window will be moved + to include that line. It only shows the first 100 lines by default! Max + `window_size` supported is 2000. Parameters: - file_path (str): The file path to open, preferred absolute path. + artifacts (Artifacts): The artifacts object to open the artifact from. + name (str): The name of the artifact to open. line_num (int): The line number to move the window to. window_size (int): The number of lines to show above and below the line. 
""" + if name not in artifacts: + return f"[Artifact {name} does not exist]" - file_path_p = Path(file_path) - if not file_path_p.exists(): - return f"[File {file_path} does not exist]" - - total_lines = sum(1 for _ in open(file_path_p)) + total_lines = len(artifacts[name].splitlines()) window_size = min(window_size, 2000) window_size = window_size // 2 if line_num - window_size < 0: @@ -171,158 +116,45 @@ def open_file(file_path: str, line_num: int = 0, window_size: int = 100) -> str: elif line_num >= total_lines: line_num = total_lines - 1 - window_size - global CURRENT_LINE, CURRENT_FILE - CURRENT_LINE = line_num - CURRENT_FILE = file_path - - with open(file_path, "r") as f: - lines = f.readlines() - - return view_lines(lines, line_num, window_size, file_path, total_lines) - - -def create_file(file_path: str) -> str: - """Creates and opens a new file with the given name. - - Parameters: - file_path (str): The file path to create, preferred absolute path. - """ - - file_path_p = Path(file_path) - if file_path_p.exists(): - return f"[File {file_path} already exists]" - file_path_p.touch() - global CURRENT_FILE - CURRENT_FILE = file_path - return f"[File created {file_path}]" - - -def scroll_up() -> str: - """Moves the window up by 100 lines.""" - if CURRENT_FILE is None: - return "[No file is open]" - - return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE) - - -def scroll_down() -> str: - """Moves the window down by 100 lines.""" - if CURRENT_FILE is None: - return "[No file is open]" - - return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE) - - -def search_dir(search_term: str, dir_path: str) -> str: - """Searches for search_term in all files in a directory. - - Parameters: - search_term (str): The search term to look for. - dir_path (str): The directory path to search in, preferred absolute path. 
- """ - - dir_path_p = Path(dir_path) - if not dir_path_p.exists(): - return f"[Directory {dir_path} does not exist]" - - matches = [] - for file in dir_path_p.glob("**/*"): - if filter_file(file): - with open(file, "r") as f: - lines = f.readlines() - for i, line in enumerate(lines): - if search_term in line: - matches.append(f"{file}:{i}|{line.strip()}\n") - if not matches: - return f"[No matches found for {search_term} in {dir_path}]" - if len(matches) > 100: - return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]" + lines = artifacts[name].splitlines() - return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n" - for match in matches: - return_str += match + return view_lines(lines, line_num, window_size, name, total_lines) - return_str += f"[End of matches for {search_term} in {dir_path}]" - return return_str - -def search_file(search_term: str, file_path: str) -> str: - """Searches the file for the given search term. +def create_artifact(artifacts: Artifacts, name: str) -> str: + """Creates a new artifiact with the given name. Parameters: - search_term (str): The search term to look for. - file_path (str): The file path to search in, preferred absolute path. + artifacts (Artifacts): The artifacts object to add the new artifact to. + name (str): The name of the new artifact. 
""" + if name in artifacts: + return f"[Artifact {name} already exists]" + artifacts[name] = "" + return f"[Artifact {name} created]" - file_path_p = Path(file_path) - if not file_path_p.exists(): - return f"[File {file_path} does not exist]" - - with open(file_path_p, "r") as f: - lines = f.readlines() - - search_results = [] - for i, line in enumerate(lines): - if search_term in line: - search_results.append(f"{i}|{line.strip()}\n") - if not search_results: - return f"[No matches found for {search_term} in {file_path}]" - - return_str = ( - f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n" - ) - for result in search_results: - return_str += result - - return_str += f"[End of matches for {search_term} in {file_path}]" - return return_str - - -def find_file(file_name: str, dir_path: str = "./") -> str: - """Finds all files with the given name in the specified directory. - - Parameters: - file_name (str): The file name to look for. - dir_path (str): The directory path to search in, preferred absolute path. - """ - - dir_path_p = Path(dir_path) - if not dir_path_p.exists(): - return f"[Directory {dir_path} does not exist]" - - files = list(dir_path_p.glob(f"**/*{file_name}*")) - files = [f for f in files if filter_file(f)] - if not files: - return f"[No files found in {dir_path} with name {file_name}]" - - return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n" - for match in files: - return_str += str(match) + "\n" - - return_str += f"[End of matches for {file_name} in {dir_path}]" - return return_str - - -def edit_file(file_path: str, start: int, end: int, content: str) -> str: - """Edits the file at the given path with the provided content. The content will be - inserted between the `start` and `end` line numbers. If the `start` and `end` are - the same, the content will be inserted at the `start` line number. 
If the `end` is - greater than the total number of lines in the file, the content will be inserted at - the end of the file. If the `start` or `end` are negative, the function will return - an error message. +def edit_artifact( + artifacts: Artifacts, name: str, start: int, end: int, content: str +) -> str: + """Edits the given artifact with the provided content. The content will be inserted + between the `start` and `end` line numbers. If the `start` and `end` are the same, + the content will be inserted at the `start` line number. If the `end` is greater + than the total number of lines in the file, the content will be inserted at the end + of the file. If the `start` or `end` are negative, the function will return an + error message. Parameters: - file_path (str): The file path to edit, preferred absolute path. + artifacts (Artifacts): The artifacts object to edit the artifact from. + name (str): The name of the artifact to edit. start (int): The line number to start the edit. end (int): The line number to end the edit. content (str): The content to insert. 
""" - file_path_p = Path(file_path) - if not file_path_p.exists(): - return f"[File {file_path} does not exist]" + if name not in artifacts: + return f"[Artifact {name} does not exist]" - total_lines = sum(1 for _ in open(file_path_p)) + total_lines = len(artifacts[name].splitlines()) if start < 0 or end < 0 or start > end or end > total_lines: return "[Invalid line range]" if start == end: @@ -332,50 +164,391 @@ def edit_file(file_path: str, start: int, end: int, content: str) -> str: new_content_lines = [ line if line.endswith("\n") else line + "\n" for line in new_content_lines ] - with open(file_path_p, "r") as f: - lines = f.readlines() - edited_lines = lines[:start] + new_content_lines + lines[end:] + lines = artifacts[name].splitlines() + edited_lines = lines[:start] + new_content_lines + lines[end:] cur_line = start + len(content.split("\n")) // 2 - tmp_file = file_path_p.with_suffix(".tmp") - with open(tmp_file, "w") as f: - f.writelines(edited_lines) - - process = subprocess.Popen( - [ - "flake8", - "--isolated", - "--select=F821,F822,F831,E111,E112,E113,E999,E902", - tmp_file, - ], - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - stdout, _ = process.communicate() - tmp_file.unlink() - if stdout != "": - stdout = stdout.replace(tmp_file.name, file_path) - error_msg = "[Edit failed with the following status]\n" + stdout - original_view = view_lines( - lines, - start + ((end - start) // 2), - DEFAULT_WINDOW_SIZE, - file_path, - total_lines, + with tempfile.NamedTemporaryFile(delete=True) as f: + with open(f.name, "w") as f: + f.writelines(edited_lines) + + process = subprocess.Popen( + [ + "flake8", + "--isolated", + "--select=F821,F822,F831,E111,E112,E113,E999,E902", + f.name, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, ) - total_lines_edit = sum(1 for _ in edited_lines) - edited_view = view_lines( - edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit - ) - - error_msg += f"\n[This is how 
your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" - return error_msg - - with open(file_path_p, "w") as f: - f.writelines(edited_lines) + stdout, _ = process.communicate() + + if stdout != "": + stdout = stdout.replace(f.name, name) + error_msg = "[Edit failed with the following status]\n" + stdout + original_view = view_lines( + lines, + start + ((end - start) // 2), + DEFAULT_WINDOW_SIZE, + name, + total_lines, + ) + total_lines_edit = sum(1 for _ in edited_lines) + edited_view = view_lines( + edited_lines, cur_line, DEFAULT_WINDOW_SIZE, name, total_lines_edit + ) - return open_file(file_path, cur_line) + error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" + return error_msg + + artifacts[name] = "".join(edited_lines) + + return open_artifact(artifacts, name, cur_line) + + +# def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str: +# """Generates python code to solve vision based tasks. + +# Parameters: +# save_file (str): The file path to save the code. +# chat (str): The chat message from the user. +# media (List[str]): The media files to use. + +# Returns: +# str: The generated code. 
+ +# Examples +# -------- +# >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"]) +# from vision_agent.tools import load_image, owl_v2 +# def detect_dogs(image_path: str): +# image = load_image(image_path) +# dogs = owl_v2("dog", image) +# return dogs +# """ + +# if ZMQ_PORT is not None: +# agent = va.agent.VisionAgentCoder( +# report_progress_callback=lambda inp: report_progress_callback( +# int(ZMQ_PORT), inp +# ) +# ) +# else: +# agent = va.agent.VisionAgentCoder() +# try: +# fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] +# response = agent.chat_with_workflow(fixed_chat) +# code = response["code"] +# with open(save_file, "w") as f: +# f.write(code) +# code_lines = code.splitlines(keepends=True) +# total_lines = len(code_lines) +# return view_lines(code_lines, 0, total_lines, save_file, total_lines) +# except Exception as e: +# return str(e) + + +# def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str: +# """Edits python code to solve a vision based task. + +# Parameters: +# code_file (str): The file path to the code. +# chat_history (List[str]): The chat history to used to generate the code. + +# Returns: +# str: The edited code. 
+ +# Examples +# -------- +# >>> edit_vision_code( +# >>> "code.py", +# >>> ["Can you detect the dogs in this image?", "Can you use a higher threshold?"], +# >>> ["dog.jpg"], +# >>> ) +# from vision_agent.tools import load_image, owl_v2 +# def detect_dogs(image_path: str): +# image = load_image(image_path) +# dogs = owl_v2("dog", image, threshold=0.8) +# return dogs +# """ + +# agent = va.agent.VisionAgentCoder() +# with open(code_file, "r") as f: +# code = f.read() + +# # Append latest code to second to last message from assistant +# fixed_chat_history: List[Message] = [] +# for i, chat in enumerate(chat_history): +# if i == 0: +# fixed_chat_history.append({"role": "user", "content": chat, "media": media}) +# elif i > 0 and i < len(chat_history) - 1: +# fixed_chat_history.append({"role": "user", "content": chat}) +# elif i == len(chat_history) - 1: +# fixed_chat_history.append({"role": "assistant", "content": code}) +# fixed_chat_history.append({"role": "user", "content": chat}) + +# try: +# response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) +# code = response["code"] +# with open(code_file, "w") as f: +# f.write(code) +# code_lines = code.splitlines(keepends=True) +# total_lines = len(code_lines) +# return view_lines(code_lines, 0, total_lines, code_file, total_lines) +# except Exception as e: +# return str(e) + + +# def format_lines(lines: List[str], start_idx: int) -> str: +# output = "" +# for i, line in enumerate(lines): +# output += f"{i + start_idx}|{line}" +# return output + + +# def view_lines( +# lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int +# ) -> str: +# start = max(0, line_num - window_size) +# end = min(len(lines), line_num + window_size) +# return ( +# f"[File: {file_path} ({total_lines} lines total)]\n" +# + format_lines(lines[start:end], start) +# + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]") +# ) + + +# def open_file(file_path: str, line_num: int 
= 0, window_size: int = 100) -> str: +# """Opens the file at at the given path in the editor. If `line_num` is provided, +# the window will be moved to include that line. It only shows the first 100 lines by +# default! Max `window_size` supported is 2000. use `scroll up/down` to view the file +# if you want to see more. + +# Parameters: +# file_path (str): The file path to open, preferred absolute path. +# line_num (int): The line number to move the window to. +# window_size (int): The number of lines to show above and below the line. +# """ + +# file_path_p = Path(file_path) +# if not file_path_p.exists(): +# return f"[File {file_path} does not exist]" + +# total_lines = sum(1 for _ in open(file_path_p)) +# window_size = min(window_size, 2000) +# window_size = window_size // 2 +# if line_num - window_size < 0: +# line_num = window_size +# elif line_num >= total_lines: +# line_num = total_lines - 1 - window_size + +# global CURRENT_LINE, CURRENT_FILE +# CURRENT_LINE = line_num +# CURRENT_FILE = file_path + +# with open(file_path, "r") as f: +# lines = f.readlines() + +# return view_lines(lines, line_num, window_size, file_path, total_lines) + + +# def create_file(file_path: str) -> str: +# """Creates and opens a new file with the given name. + +# Parameters: +# file_path (str): The file path to create, preferred absolute path. 
+# """ + +# file_path_p = Path(file_path) +# if file_path_p.exists(): +# return f"[File {file_path} already exists]" +# file_path_p.touch() +# global CURRENT_FILE +# CURRENT_FILE = file_path +# return f"[File created {file_path}]" + + +# def scroll_up() -> str: +# """Moves the window up by 100 lines.""" +# if CURRENT_FILE is None: +# return "[No file is open]" + +# return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE) + + +# def scroll_down() -> str: +# """Moves the window down by 100 lines.""" +# if CURRENT_FILE is None: +# return "[No file is open]" + +# return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE) + + +# def search_dir(search_term: str, dir_path: str) -> str: +# """Searches for search_term in all files in a directory. + +# Parameters: +# search_term (str): The search term to look for. +# dir_path (str): The directory path to search in, preferred absolute path. +# """ + +# dir_path_p = Path(dir_path) +# if not dir_path_p.exists(): +# return f"[Directory {dir_path} does not exist]" + +# matches = [] +# for file in dir_path_p.glob("**/*"): +# if filter_file(file): +# with open(file, "r") as f: +# lines = f.readlines() +# for i, line in enumerate(lines): +# if search_term in line: +# matches.append(f"{file}:{i}|{line.strip()}\n") +# if not matches: +# return f"[No matches found for {search_term} in {dir_path}]" +# if len(matches) > 100: +# return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]" + +# return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n" +# for match in matches: +# return_str += match + +# return_str += f"[End of matches for {search_term} in {dir_path}]" +# return return_str + + +# def search_file(search_term: str, file_path: str) -> str: +# """Searches the file for the given search term. + +# Parameters: +# search_term (str): The search term to look for. +# file_path (str): The file path to search in, preferred absolute path. 
+# """ + +# file_path_p = Path(file_path) +# if not file_path_p.exists(): +# return f"[File {file_path} does not exist]" + +# with open(file_path_p, "r") as f: +# lines = f.readlines() + +# search_results = [] +# for i, line in enumerate(lines): +# if search_term in line: +# search_results.append(f"{i}|{line.strip()}\n") + +# if not search_results: +# return f"[No matches found for {search_term} in {file_path}]" + +# return_str = ( +# f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n" +# ) +# for result in search_results: +# return_str += result + +# return_str += f"[End of matches for {search_term} in {file_path}]" +# return return_str + + +# def find_file(file_name: str, dir_path: str = "./") -> str: +# """Finds all files with the given name in the specified directory. + +# Parameters: +# file_name (str): The file name to look for. +# dir_path (str): The directory path to search in, preferred absolute path. +# """ + +# dir_path_p = Path(dir_path) +# if not dir_path_p.exists(): +# return f"[Directory {dir_path} does not exist]" + +# files = list(dir_path_p.glob(f"**/*{file_name}*")) +# files = [f for f in files if filter_file(f)] +# if not files: +# return f"[No files found in {dir_path} with name {file_name}]" + +# return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n" +# for match in files: +# return_str += str(match) + "\n" + +# return_str += f"[End of matches for {file_name} in {dir_path}]" +# return return_str + + +# def edit_file(file_path: str, start: int, end: int, content: str) -> str: +# """Edits the file at the given path with the provided content. The content will be +# inserted between the `start` and `end` line numbers. If the `start` and `end` are +# the same, the content will be inserted at the `start` line number. If the `end` is +# greater than the total number of lines in the file, the content will be inserted at +# the end of the file. 
If the `start` or `end` are negative, the function will return +# an error message. + +# Parameters: +# file_path (str): The file path to edit, preferred absolute path. +# start (int): The line number to start the edit. +# end (int): The line number to end the edit. +# content (str): The content to insert. +# """ +# file_path_p = Path(file_path) +# if not file_path_p.exists(): +# return f"[File {file_path} does not exist]" + +# total_lines = sum(1 for _ in open(file_path_p)) +# if start < 0 or end < 0 or start > end or end > total_lines: +# return "[Invalid line range]" +# if start == end: +# end += 1 + +# new_content_lines = content.splitlines(keepends=True) +# new_content_lines = [ +# line if line.endswith("\n") else line + "\n" for line in new_content_lines +# ] +# with open(file_path_p, "r") as f: +# lines = f.readlines() +# edited_lines = lines[:start] + new_content_lines + lines[end:] + +# cur_line = start + len(content.split("\n")) // 2 +# tmp_file = file_path_p.with_suffix(".tmp") +# with open(tmp_file, "w") as f: +# f.writelines(edited_lines) + +# process = subprocess.Popen( +# [ +# "flake8", +# "--isolated", +# "--select=F821,F822,F831,E111,E112,E113,E999,E902", +# tmp_file, +# ], +# stdout=subprocess.PIPE, +# stderr=subprocess.PIPE, +# text=True, +# ) +# stdout, _ = process.communicate() +# tmp_file.unlink() +# if stdout != "": +# stdout = stdout.replace(tmp_file.name, file_path) +# error_msg = "[Edit failed with the following status]\n" + stdout +# original_view = view_lines( +# lines, +# start + ((end - start) // 2), +# DEFAULT_WINDOW_SIZE, +# file_path, +# total_lines, +# ) +# total_lines_edit = sum(1 for _ in edited_lines) +# edited_view = view_lines( +# edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit +# ) + +# error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" +# return error_msg + +# with open(file_path_p, "w") as f: +# 
f.writelines(edited_lines) + +# return open_file(file_path, cur_line) def get_tool_descriptions() -> str: @@ -388,15 +561,19 @@ def get_tool_descriptions() -> str: META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, - generate_vision_code, - edit_vision_code, - open_file, - create_file, - scroll_up, - scroll_down, - edit_file, - search_dir, - search_file, - find_file, + open_artifact, + create_artifact, + edit_artifact, + # generate_vision_code, + # edit_vision_code, + # open_file, + # create_file, + # scroll_up, + # scroll_down, + # edit_file, + # search_dir, + # search_file, + # find_file, + # florencev2_fine_tuning, ] ) From 8dede494c0ea25de9dbc9e490f986e563c869aca Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 15 Aug 2024 10:53:02 -0700 Subject: [PATCH 03/37] update local executor --- vision_agent/utils/execute.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 033276d3..1ae5e446 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -40,6 +40,7 @@ load_dotenv() _LOGGER = logging.getLogger(__name__) _SESSION_TIMEOUT = 600 # 10 minutes +WORKSPACE = Path(os.getenv("WORKSPACE", "")) class MimeType(str, Enum): @@ -607,6 +608,22 @@ def exec_cell(self, code: str) -> Execution: traceback_raw = traceback.format_exc().splitlines() return Execution.from_exception(e, traceback_raw) + def upload_file(self, file_path: str) -> Path: + with open(file_path) as f: + contents = f.read() + with open(WORKSPACE / file_path, "wb") as f: + f.write(contents) + + return Path(WORKSPACE / file_path) + + def download_file(self, file_path: str) -> Path: + with open(file_path, "rb") as f: + contents = f.read() + with open(WORKSPACE / file_path, "wb") as f: + f.write(contents) + return Path(WORKSPACE / file_path) + + class CodeInterpreterFactory: """Factory class for creating code interpreters. 
From 97556be30c9348fceae8285f6558dd3e506da26e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 26 Aug 2024 09:55:25 -0700 Subject: [PATCH 04/37] fix upload/download --- vision_agent/utils/execute.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 1ae5e446..299f5d2c 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -609,7 +609,7 @@ def exec_cell(self, code: str) -> Execution: return Execution.from_exception(e, traceback_raw) def upload_file(self, file_path: str) -> Path: - with open(file_path) as f: + with open(file_path, "rb") as f: contents = f.read() with open(WORKSPACE / file_path, "wb") as f: f.write(contents) @@ -617,11 +617,11 @@ def upload_file(self, file_path: str) -> Path: return Path(WORKSPACE / file_path) def download_file(self, file_path: str) -> Path: - with open(file_path, "rb") as f: + with open(WORKSPACE / file_path, "rb") as f: contents = f.read() - with open(WORKSPACE / file_path, "wb") as f: + with open(file_path, "wb") as f: f.write(contents) - return Path(WORKSPACE / file_path) + return Path(file_path) From 82169c24284764a8fc9634341a184551ead98698 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 26 Aug 2024 12:46:53 -0700 Subject: [PATCH 05/37] cleaned up code for artifacts --- vision_agent/agent/vision_agent.py | 63 ++++++++++++++++-------------- 1 file changed, 34 insertions(+), 29 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index dad2d824..f497f467 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,8 +1,8 @@ import copy import logging import os -import tempfile import pickle as pkl +import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast @@ -14,8 +14,8 @@ VA_CODE, ) from vision_agent.lmm import LMM, Message, OpenAILMM -from vision_agent.tools.meta_tools import 
Artifacts from vision_agent.tools import META_TOOL_DOCSTRING +from vision_agent.tools.meta_tools import Artifacts from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter @@ -28,24 +28,30 @@ os.environ["PYTHONPATH"] = f"{WORKSPACE}:{os.getenv('PYTHONPATH', '')}" -class DefaultImports: - code = [ +class BoilerplateCode: + pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions", - f"artifacts = Artifacts({ARTIFACT})", + "artifacts = Artifacts({remote_path})", + "artifacts.load({remote_path})", + ] + post_code = [ + "artifacts.save()", ] @staticmethod - def to_code_string() -> str: - return "\n".join(DefaultImports.code) - - @staticmethod - def prepend_imports(code: str) -> str: + def add_boilerplate(code: str) -> str: """Run this method to prepend the default imports to the code. NOTE: be sure to run this method after the custom tools have been registered. 
""" - return DefaultImports.to_code_string() + "\n\n" + code + return ( + "\n".join(BoilerplateCode.pre_code) + + "\n\n" + + code + + "\n\n" + + "\n".join(BoilerplateCode.post_code) + ) def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: @@ -71,22 +77,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore -def run_code_action(code: str, artifacts: Artifacts, code_interpreter: CodeInterpreter) -> str: - with tempfile.TemporaryDirectory() as tmpdirname: - for name in artifacts: - temp_file_path = Path(tmpdirname) / name + ".py" - with open(temp_file_path, "w") as f: - f.write(artifacts[name]) - code_interpreter.upload_file(temp_file_path) - temp_file_path.unlink() - - temp_file_path = Path(tmpdirname) / ARTIFACT - with open(temp_file_path, "wb") as f: - pkl.dump(artifacts.artifacts, f) - code_interpreter.upload_file(temp_file_path) - temp_file_path.unlink() - - result = code_interpreter.exec_cell(DefaultImports.prepend_imports(code)) +def run_code_action( + code: str, code_interpreter: CodeInterpreter +) -> str: + result = code_interpreter.exec_cell(BoilerplateCode.add_boilerplate(code)) return_str = "" if result.success: @@ -150,6 +144,7 @@ def __call__( self, input: Union[str, List[Message]], media: Optional[Union[str, Path]] = None, + artifacts: Optional[Artifacts] = None, ) -> str: """Chat with VisionAgent and get the conversation response. @@ -172,13 +167,13 @@ def __call__( def chat_with_code( self, chat: List[Message], + artifacts: Optional[Artifacts] = None, ) -> List[Message]: """Chat with VisionAgent, it will use code to execute actions to accomplish its tasks. 
Parameters: - chat (List[Message]): A conversation - in the format of: + chat (List[Message]): A conversation in the format of: [{"role": "user", "content": "describe your task here..."}] or if it contains media files, it should be in the format of: [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}] @@ -190,6 +185,10 @@ def chat_with_code( if not chat: raise ValueError("chat cannot be empty") + if not artifacts: + artifacts = Artifacts("artifacts.pkl") + artifacts.save() + with CodeInterpreterFactory.new_instance( code_sandbox_runtime=self.code_sandbox_runtime ) as code_interpreter: @@ -222,6 +221,8 @@ def chat_with_code( finished = False iterations = 0 while not finished and iterations < self.max_iterations: + artifacts_remote_path = code_interpreter.upload_file(artifacts.save_path) + response = run_conversation(self.agent, int_chat) if self.verbosity >= 1: _LOGGER.info(response) @@ -235,6 +236,10 @@ def chat_with_code( if code_action is not None: obs = run_code_action(code_action, code_interpreter) + artifacts_local_path = code_interpreter.download_file(artifacts_remote_path) + artifacts.load(artifacts_local_path) + artifacts.save() + if self.verbosity >= 1: _LOGGER.info(obs) int_chat.append({"role": "observation", "content": obs}) From d1f160201a4c541a70279a8d9fdf1abbbfb5bc8d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Mon, 26 Aug 2024 12:47:03 -0700 Subject: [PATCH 06/37] starting artifact prompts --- vision_agent/agent/vision_agent_prompts.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 7b714378..cf1e9a33 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -32,6 +32,16 @@ """ +EXAMPLES_CODE1_ARTIFACT = """ +USER: Can you write a simple application that adds two numbers? 
+ +AGENT: {"thoughts": "The user has asked to add two numbers, I will generate the code to add two numbers.", "response": "create_artifact(artifact, 'add_two_numbers')", "let_user_respond": false} + +OBSERVATION: +[Artifact add_two_numbers created] +""" + + EXAMPLES_CODE1 = """ USER: Can you detect the dogs in this image? Media name dog.jpg From 2fc76a53ac256550c5983d8aaacc29ed938e5aa5 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 08:39:24 -0700 Subject: [PATCH 07/37] app to add files to artifacts --- examples/chat/app.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/chat/app.py b/examples/chat/app.py index f1cd62e7..94f3b23a 100644 --- a/examples/chat/app.py +++ b/examples/chat/app.py @@ -26,6 +26,8 @@ "response": "saved", "style": {"bottom": "calc(50% - 4.25rem", "right": "0.4rem"}, } +artifacts = va.tools.Artifacts("artifacts.pkl") +artifacts.save() agent = va.agent.VisionAgent(verbosity=1) st.set_page_config(layout="wide") @@ -44,7 +46,9 @@ def update_messages(messages, lock): - new_chat = agent.chat_with_code(messages) + if Path("artifacts.pkl").exists(): + artifacts.load("artifacts.pkl") + new_chat = agent.chat_with_code(messages, artifacts=artifacts) with lock: for new_message in new_chat: if new_message not in messages: @@ -121,6 +125,7 @@ def main(): if uploaded_file is not None: with open(WORKSPACE / uploaded_file.name, "wb") as f: f.write(uploaded_file.getbuffer()) + artifacts.artifacts[WORKSPACE / uploaded_file.name] = "" for file in WORKSPACE.iterdir(): if "__pycache__" not in str(file) and not str(file).startswith("."): From 11cef6f4e60a831c6ba73d7165c21b2c98fbefc4 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 08:40:11 -0700 Subject: [PATCH 08/37] add support for artifacts --- vision_agent/agent/vision_agent.py | 83 +++++++++++++++--------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/vision_agent/agent/vision_agent.py 
b/vision_agent/agent/vision_agent.py index f497f467..0d617c8a 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,17 +1,15 @@ import copy import logging import os -import pickle as pkl -import tempfile from pathlib import Path from typing import Any, Dict, List, Optional, Union, cast from vision_agent.agent import Agent from vision_agent.agent.agent_utils import extract_json from vision_agent.agent.vision_agent_prompts import ( - EXAMPLES_CODE1, - EXAMPLES_CODE2, - VA_CODE, + EXAMPLES_CODE1_ARTIFACT, + EXAMPLES_CODE2_ARTIFACT, + VA_CODE_ARTIFACT, ) from vision_agent.lmm import LMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING @@ -21,7 +19,6 @@ logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) -ARTIFACT = "artifacts.pkl" WORKSPACE = Path(os.getenv("WORKSPACE", "")) WORKSPACE.mkdir(parents=True, exist_ok=True) if str(WORKSPACE) != "": @@ -32,25 +29,25 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions", - "artifacts = Artifacts({remote_path})", - "artifacts.load({remote_path})", + "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code", + "artifacts = Artifacts('{remote_path}')", + "artifacts.load('{remote_path}')", ] post_code = [ "artifacts.save()", ] @staticmethod - def add_boilerplate(code: str) -> str: + def add_boilerplate(code: str, **format) -> str: """Run this method to prepend the default imports to the code. NOTE: be sure to run this method after the custom tools have been registered. 
""" return ( - "\n".join(BoilerplateCode.pre_code) + "\n".join([s.format(**format) for s in BoilerplateCode.pre_code]) + "\n\n" + code + "\n\n" - + "\n".join(BoilerplateCode.post_code) + + "\n".join([s.format(**format) for s in BoilerplateCode.post_code]) ) @@ -68,38 +65,21 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: else: raise ValueError(f"role {chat_i['role']} is not supported") - prompt = VA_CODE.format( + prompt = VA_CODE_ARTIFACT.format( documentation=META_TOOL_DOCSTRING, - examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}", - dir=WORKSPACE, + examples=f"{EXAMPLES_CODE1_ARTIFACT}\n{EXAMPLES_CODE2_ARTIFACT}", conversation=conversation, ) return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore def run_code_action( - code: str, code_interpreter: CodeInterpreter + code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str ) -> str: - result = code_interpreter.exec_cell(BoilerplateCode.add_boilerplate(code)) - - return_str = "" - if result.success: - for res in result.results: - if res.text is not None: - return_str += res.text.replace("\\n", "\n") - if result.logs.stdout: - return_str += "----- stdout -----\n" - for log in result.logs.stdout: - return_str += log.replace("\\n", "\n") - else: - # for log in result.logs.stderr: - # return_str += log.replace("\\n", "\n") - if result.error: - return_str += ( - "\n" + result.error.value + "\n".join(result.error.traceback_raw) - ) - - return return_str + result = code_interpreter.exec_cell( + BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) + ) + return result.text() def parse_execution(response: str) -> Optional[str]: @@ -112,8 +92,8 @@ def parse_execution(response: str) -> Optional[str]: class VisionAgent(Agent): """Vision Agent is an agent that can chat with the user and call tools or other - agents to generate code for it. Vision Agent uses python code to execute actions for - the user. 
Vision Agent is inspired by by OpenDev + agents to generate code for it. Vision Agent uses python code to execute actions + for the user. Vision Agent is inspired by by OpenDev https://github.com/OpenDevin/OpenDevin and CodeAct https://arxiv.org/abs/2402.01030 Example @@ -161,7 +141,7 @@ def __call__( input = [{"role": "user", "content": input}] if media is not None: input[0]["media"] = [media] - results = self.chat_with_code(input) + results = self.chat_with_code(input, artifacts) return results # type: ignore def chat_with_code( @@ -200,6 +180,10 @@ def chat_with_code( for media in chat_i["media"]: media = code_interpreter.upload_file(media) chat_i["content"] += f" Media name {media}" # type: ignore + # Save dummy value for now since we just need to know the path + # name in the key 'media'. Later on we can add artifact support + # for byte data. + artifacts.artifacts[media] = "" media_list.append(media) int_chat = cast( @@ -220,8 +204,14 @@ def chat_with_code( finished = False iterations = 0 + last_response = None while not finished and iterations < self.max_iterations: - artifacts_remote_path = code_interpreter.upload_file(artifacts.save_path) + artifacts_remote_path = code_interpreter.upload_file( + artifacts.save_path + ) + artifacts_loaded = artifacts.show() + int_chat.append({"role": "observation", "content": artifacts_loaded}) + orig_chat.append({"role": "observation", "content": artifacts_loaded}) response = run_conversation(self.agent, int_chat) if self.verbosity >= 1: @@ -229,14 +219,22 @@ def chat_with_code( int_chat.append({"role": "assistant", "content": str(response)}) orig_chat.append({"role": "assistant", "content": str(response)}) + # sometimes it gets stuck in a loop, so we force it to exit + if last_response == response: + response["let_user_respond"] = True + if response["let_user_respond"]: break code_action = parse_execution(response["response"]) if code_action is not None: - obs = run_code_action(code_action, code_interpreter) - 
artifacts_local_path = code_interpreter.download_file(artifacts_remote_path) + obs = run_code_action( + code_action, code_interpreter, artifacts_remote_path + ) + artifacts_local_path = code_interpreter.download_file( + artifacts_remote_path + ) artifacts.load(artifacts_local_path) artifacts.save() @@ -246,6 +244,7 @@ def chat_with_code( orig_chat.append({"role": "observation", "content": obs}) iterations += 1 + last_response = response return orig_chat def log_progress(self, data: Dict[str, Any]) -> None: From 0163daa5708e7aa5117679331f974fa7ef7cd35b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 08:40:34 -0700 Subject: [PATCH 09/37] add artifact meta tools --- vision_agent/tools/meta_tools.py | 134 ++++++++++++++++++++++++++++--- 1 file changed, 123 insertions(+), 11 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 29d56c55..4b245757 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -3,7 +3,7 @@ import subprocess import tempfile from pathlib import Path -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union from uuid import UUID import vision_agent as va @@ -11,8 +11,8 @@ from vision_agent.lmm.types import Message from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS -from vision_agent.utils.image_utils import convert_to_b64 from vision_agent.utils import CodeInterpreterFactory +from vision_agent.utils.image_utils import convert_to_b64 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -43,7 +43,7 @@ def filter_file(file_name: Union[str, Path]) -> bool: class Artifacts: def __init__(self, save_path: Union[str, Path]) -> None: - self.save_path = save_path + self.save_path = Path(save_path) self.artifacts = {} self.code_sandbox_runtime = None @@ -51,6 +51,16 @@ def __init__(self, save_path: 
Union[str, Path]) -> None: def load(self, file_path: Union[str, Path]) -> None: with open(file_path, "rb") as f: self.artifacts = pkl.load(f) + for k, v in self.artifacts.items(): + with open(self.save_path.parent / k, "w") as f: + f.write(v) + + def show(self) -> str: + out_str = "[Artifacts loaded]\n" + for k in self.artifacts.keys(): + out_str += f"Artifact {k} loaded to {str(self.save_path.parent / k)}\n" + out_str += "[End of artifacts]\n" + return out_str def save(self) -> None: with open(self.save_path, "wb") as f: @@ -81,7 +91,7 @@ def view_lines( ) -> str: start = max(0, line_num - window_size) end = min(len(lines), line_num + window_size) - return ( + return_str = ( f"[Artifact: {name} ({total_lines} lines total)]\n" + format_lines(lines[start:end], start) + ( @@ -90,6 +100,8 @@ def view_lines( else f"[{len(lines) - end} more lines]" ) ) + print(return_str) + return return_str def open_artifact( @@ -116,7 +128,7 @@ def open_artifact( elif line_num >= total_lines: line_num = total_lines - 1 - window_size - lines = artifacts[name].splitlines() + lines = artifacts[name].splitlines(keepends=True) return view_lines(lines, line_num, window_size, name, total_lines) @@ -129,9 +141,12 @@ def create_artifact(artifacts: Artifacts, name: str) -> str: name (str): The name of the new artifact. """ if name in artifacts: - return f"[Artifact {name} already exists]" - artifacts[name] = "" - return f"[Artifact {name} created]" + return_str = f"[Artifact {name} already exists]" + else: + artifacts[name] = "" + return_str = f"[Artifact {name} created]" + print(return_str) + return return_str def edit_artifact( @@ -151,8 +166,10 @@ def edit_artifact( end (int): The line number to end the edit. content (str): The content to insert. 
""" + # just make the artifact if it doesn't exist instead of forcing agent to call + # create_artifact if name not in artifacts: - return f"[Artifact {name} does not exist]" + artifacts[name] = "" total_lines = len(artifacts[name].splitlines()) if start < 0 or end < 0 or start > end or end > total_lines: @@ -208,6 +225,101 @@ def edit_artifact( return open_artifact(artifacts, name, cur_line) +def generate_vision_code( + artifacts: Artifacts, name: str, chat: str, media: List[str] +) -> str: + """Generates python code to solve vision based tasks. + + Parameters: + artifacts (Artifacts): The artifacts object to save the code to. + name (str): The name of the artifact to save the code to. + chat (str): The chat message from the user. + media (List[str]): The media files to use. + + Returns: + str: The generated code. + + Examples + -------- + >>> generate_vision_code(artifacts, "code.py", "Can you detect the dogs in this image?", ["image.jpg"]) + from vision_agent.tools import load_image, owl_v2 + def detect_dogs(image_path: str): + image = load_image(image_path) + dogs = owl_v2("dog", image) + return dogs + """ + + if ZMQ_PORT is not None: + agent = va.agent.VisionAgentCoder( + report_progress_callback=lambda inp: report_progress_callback( + int(ZMQ_PORT), inp + ) + ) + else: + agent = va.agent.VisionAgentCoder() + + fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] + response = agent.chat_with_workflow(fixed_chat) + code = response["code"] + artifacts[name] = code + code_lines = code.splitlines(keepends=True) + total_lines = len(code_lines) + return view_lines(code_lines, 0, total_lines, name, total_lines) + + +def edit_vision_code( + artifacts: Artifacts, name: str, chat_history: List[str], media: List[str] +) -> str: + """Edits python code to solve a vision based task. + + Parameters: + artifacts (Artifacts): The artifacts object to save the code to. + name (str): The file path to the code. 
+ chat_history (List[str]): The chat history to used to generate the code. + + Returns: + str: The edited code. + + Examples + -------- + >>> edit_vision_code( + >>> artifacts, + >>> "code.py", + >>> ["Can you detect the dogs in this image?", "Can you use a higher threshold?"], + >>> ["dog.jpg"], + >>> ) + from vision_agent.tools import load_image, owl_v2 + def detect_dogs(image_path: str): + image = load_image(image_path) + dogs = owl_v2("dog", image, threshold=0.8) + return dogs + """ + + agent = va.agent.VisionAgentCoder() + if name not in artifacts: + return f"[Artifact {name} does not exist]" + + code = artifacts[name] + + # Append latest code to second to last message from assistant + fixed_chat_history: List[Message] = [] + for i, chat in enumerate(chat_history): + if i == 0: + fixed_chat_history.append({"role": "user", "content": chat, "media": media}) + elif i > 0 and i < len(chat_history) - 1: + fixed_chat_history.append({"role": "user", "content": chat}) + elif i == len(chat_history) - 1: + fixed_chat_history.append({"role": "assistant", "content": code}) + fixed_chat_history.append({"role": "user", "content": chat}) + + response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) + code = response["code"] + artifacts[name] = code + code_lines = code.splitlines(keepends=True) + total_lines = len(code_lines) + return view_lines(code_lines, 0, total_lines, name, total_lines) + + # def generate_vision_code(save_file: str, chat: str, media: List[str]) -> str: # """Generates python code to solve vision based tasks. 
@@ -564,8 +676,8 @@ def get_tool_descriptions() -> str: open_artifact, create_artifact, edit_artifact, - # generate_vision_code, - # edit_vision_code, + generate_vision_code, + edit_vision_code, # open_file, # create_file, # scroll_up, From 2596f433ae42e00ed2b4d55eb29353c1f2394f11 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 08:41:06 -0700 Subject: [PATCH 10/37] ran isort --- vision_agent/clients/landing_public_api.py | 4 ++-- vision_agent/tools/tools_types.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/vision_agent/clients/landing_public_api.py b/vision_agent/clients/landing_public_api.py index eec218ad..2319bf89 100644 --- a/vision_agent/clients/landing_public_api.py +++ b/vision_agent/clients/landing_public_api.py @@ -5,9 +5,9 @@ from requests.exceptions import HTTPError from vision_agent.clients.http import BaseHTTP -from vision_agent.utils.type_defs import LandingaiAPIKey +from vision_agent.tools.tools_types import BboxInputBase64, JobStatus, PromptTask from vision_agent.utils.exceptions import FineTuneModelNotFound -from vision_agent.tools.tools_types import BboxInputBase64, PromptTask, JobStatus +from vision_agent.utils.type_defs import LandingaiAPIKey class LandingPublicAPI(BaseHTTP): diff --git a/vision_agent/tools/tools_types.py b/vision_agent/tools/tools_types.py index aeb45c95..7b640adb 100644 --- a/vision_agent/tools/tools_types.py +++ b/vision_agent/tools/tools_types.py @@ -1,8 +1,8 @@ -from uuid import UUID from enum import Enum -from typing import List, Tuple, Optional +from typing import List, Optional, Tuple +from uuid import UUID -from pydantic import BaseModel, ConfigDict, Field, field_serializer, SerializationInfo +from pydantic import BaseModel, ConfigDict, Field, SerializationInfo, field_serializer class BboxInput(BaseModel): From 51d49f5ee4689008d242602fa45039ccaafa6deb Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 09:38:18 -0700 Subject: [PATCH 11/37] prompt to work 
with artifacts --- vision_agent/agent/vision_agent_prompts.py | 55 +++++++++++++--------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index cf1e9a33..93acb871 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,7 +1,7 @@ VA_CODE = """ **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it. -**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . +**Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code. print("Hello World!") @@ -32,29 +32,24 @@ """ -EXAMPLES_CODE1_ARTIFACT = """ -USER: Can you write a simple application that adds two numbers? - -AGENT: {"thoughts": "The user has asked to add two numbers, I will generate the code to add two numbers.", "response": "create_artifact(artifact, 'add_two_numbers')", "let_user_respond": false} +EXAMPLES_CODE1_ARTIFACTS = """ +USER: Can you detect the dogs in this image? Media name dog.jpg OBSERVATION: -[Artifact add_two_numbers created] -""" - - -EXAMPLES_CODE1 = """ -USER: Can you detect the dogs in this image? 
Media name dog.jpg +[Artifacts loaded] +Artifact dog.jpg loaded to /path/to/images/dog.jpg +[End of artifacts] -AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code('/example/workspace/dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/example/workspace/dog.jpg'])", "let_user_respond": false} +AGENT: {"thoughts": "I will use the generate_vision_code to detect the dogs in the image.", "response": "generate_vision_code(artifacts, 'dog_detector.py', 'Can you write code to detect dogs in this image?', media=['/path/to/images/dog.jpg'])", "let_user_respond": false} OBSERVATION: -[File /example/workspace/dog_detector.py] +[Artifact dog_detector.py] 0|from vision_agent.tools import load_image, owl_v2 1|def detect_dogs(image_path: str): 2| image = load_image(image_path) 3| dogs = owl_v2("dog", image) 4| return dogs -[End of file] +[End of artifact] AGENT: {"thoughts": "I have generated the code to detect the dogs in the image, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} @@ -67,18 +62,23 @@ USER: The the image only has one dog, can you fix this? -AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code('/example/workspace/dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/example/workspace/dog.jpg'])", "let_user_respond": false} +[Artifacts loaded] +Artifact dog.jpg loaded to /path/to/images/dog.jpg +Artifact dog_detector.py loaded to /path/to/code/dog_detector.py +[End of artifacts] + +AGENT: {"thoughts": "I will use edit_vision_code to fix the code to detect only one dog. 
The lower score is 0.23 so I will set the threshold above that.", "response": "edit_vision_code(artifacts, 'dog_detector.py', ['Can you write code to detect dogs in this image?', 'Can you increase the threshold to 0.24?'], media=['/path/to/images/dog.jpg'])", "let_user_respond": false} OBSERVATION: -[File /example/workspace/dog_detector.py] +[Artifact dog_detector.py] 0|from vision_agent.tools import load_image, owl_v2 1|def detect_dogs(image_path: str): 2| image = load_image(image_path) 3| dogs = owl_v2("dog", image, threshold=0.24) 4| return dogs -[End of file] +[End of artifact] -AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/example/workspace/dog.jpg'))", "let_user_respond": false} +AGENT: {"thoughts": "I have edited the code to detect only one dog, I must now run the code to get the output.", "response": "from dog_detector import detect_dogs\n print(detect_dogs('/path/to/images/dog.jpg'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- @@ -88,17 +88,26 @@ """ -EXAMPLES_CODE2 = """ +EXAMPLES_CODE2_ARTIFACTS = """ USER: Can you create a function to count workers with helmets? +OBSERVATION: +[Artifacts loaded] +[End of artifacts] + AGENT: {"thoughts": "The user has asked to count workers with helmets but has not provided an image. 
I will ask the user for an image and then generate the code to count workers with helmets.", "response": "Can you provide an image of workers with helmets?", "let_user_respond": true} USER: Yes you can use workers.png -AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code('/example/workspace/code.py', 'Can you write code to count workers with helmets in this image?', media=['/example/workspace/workers.png'])", "let_user_respond": false} +OBSERVATION: +[Artifacts loaded] +Artifact workers.png loaded to /path/to/images/workers.png +[End of artifacts] + +AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you write code to count workers with helmets in this image?', media=['/paths/to/images/workers.png'])", "let_user_respond": false} OBSERVATION: -[File /example/workspace/code.py] +[Artifact code.py] 0|from vision_agent.tools import load_image, owl_v2, closest_box_distance 1|def count_workers_with_helmets(image_path: str): 2| image = load_image(image_path) @@ -115,9 +124,9 @@ 13| if person_has_helmet: 14| count += 1 15| return count -[End of file] +[End of artifact] -AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/example/workspace/workers.png'))", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png'))", "let_user_respond": false} OBSERVATION: ----- stdout ----- From e31de9f0784b2b4e358f010fd1ef269897d18f21 Mon Sep 17 00:00:00 2001 From: Dillon Laird 
Date: Wed, 28 Aug 2024 09:42:35 -0700 Subject: [PATCH 12/37] minor fixes for prompts --- vision_agent/agent/vision_agent_prompts.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 93acb871..7e98bd94 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,4 +1,4 @@ -VA_CODE = """ +VA_CODE_ARTIFACTS = """ **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it. **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code. 
@@ -15,7 +15,6 @@ **Examples**: Here is an example of how you can interact with a user and Actions to complete a task: --- START EXAMPLES --- -[Current directory: /example/workspace] {examples} --- END EXAMPLES --- @@ -26,8 +25,6 @@ **Conversation**: Here is the current conversation so far: --- START CONVERSATION --- -[Current directory: {dir}] - {conversation} """ From 9e83881ca81a6caa6fb1f19e7cda3cf9d3d892db Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:02:36 -0700 Subject: [PATCH 13/37] add docs, fix load and saving remote files --- vision_agent/agent/vision_agent.py | 72 +++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 0d617c8a..17fe347d 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -7,9 +7,9 @@ from vision_agent.agent import Agent from vision_agent.agent.agent_utils import extract_json from vision_agent.agent.vision_agent_prompts import ( - EXAMPLES_CODE1_ARTIFACT, - EXAMPLES_CODE2_ARTIFACT, - VA_CODE_ARTIFACT, + EXAMPLES_CODE1, + EXAMPLES_CODE2, + VA_CODE, ) from vision_agent.lmm import LMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING @@ -65,9 +65,9 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: else: raise ValueError(f"role {chat_i['role']} is not supported") - prompt = VA_CODE_ARTIFACT.format( + prompt = VA_CODE.format( documentation=META_TOOL_DOCSTRING, - examples=f"{EXAMPLES_CODE1_ARTIFACT}\n{EXAMPLES_CODE2_ARTIFACT}", + examples=f"{EXAMPLES_CODE1}\n{EXAMPLES_CODE2}", conversation=conversation, ) return extract_json(orch([{"role": "user", "content": prompt}], stream=False)) # type: ignore @@ -109,8 +109,20 @@ def __init__( self, agent: Optional[LMM] = None, verbosity: int = 0, + local_artifacts_path: Optional[Union[str, Path]] = None, code_sandbox_runtime: Optional[str] = None, ) -> None: + """Initialize the 
VisionAgent. + + Parameters: + agent (Optional[LMM]): The agent to use for conversation and orchestration + of other agents. + verbosity (int): The verbosity level of the agent. + local_artifacts_path (Optional[Union[str, Path]]): The path to the local + artifacts file. + code_sandbox_runtime (Optional[str]): The code sandbox runtime to use. + """ + self.agent = ( OpenAILMM(temperature=0.0, json_mode=True) if agent is None else agent ) @@ -119,13 +131,18 @@ def __init__( self.code_sandbox_runtime = code_sandbox_runtime if self.verbosity >= 1: _LOGGER.setLevel(logging.INFO) + self.local_artifacts_path = ( + Path(local_artifacts_path) + if local_artifacts_path is not None + else "artifacts.pkl" + ) def __call__( self, input: Union[str, List[Message]], media: Optional[Union[str, Path]] = None, artifacts: Optional[Artifacts] = None, - ) -> str: + ) -> List[Message]: """Chat with VisionAgent and get the conversation response. Parameters: @@ -133,6 +150,7 @@ def __call__( [{"role": "user", "content": "describe your task here..."}, ...] or a string of just the contents. media (Optional[Union[str, Path]]): The media file to be used in the task. + artifacts (Optional[Artifacts]): The artifacts to use in the task. Returns: str: The conversation response. @@ -157,6 +175,7 @@ def chat_with_code( [{"role": "user", "content": "describe your task here..."}] or if it contains media files, it should be in the format of: [{"role": "user", "content": "describe your task here...", "media": ["image1.jpg", "image2.jpg"]}] + artifacts (Optional[Artifacts]): The artifacts to use in the task. Returns: List[Message]: The conversation response. 
@@ -166,8 +185,7 @@ def chat_with_code( raise ValueError("chat cannot be empty") if not artifacts: - artifacts = Artifacts("artifacts.pkl") - artifacts.save() + artifacts = Artifacts(WORKSPACE / "artifacts.pkl") with CodeInterpreterFactory.new_instance( code_sandbox_runtime=self.code_sandbox_runtime @@ -183,7 +201,7 @@ def chat_with_code( # Save dummy value for now since we just need to know the path # name in the key 'media'. Later on we can add artifact support # for byte data. - artifacts.artifacts[media] = "" + artifacts.artifacts[Path(media).name] = None media_list.append(media) int_chat = cast( @@ -205,14 +223,22 @@ def chat_with_code( finished = False iterations = 0 last_response = None - while not finished and iterations < self.max_iterations: - artifacts_remote_path = code_interpreter.upload_file( - artifacts.save_path - ) - artifacts_loaded = artifacts.show() - int_chat.append({"role": "observation", "content": artifacts_loaded}) - orig_chat.append({"role": "observation", "content": artifacts_loaded}) + # Save the current state of artifacts, will include any images the user + # passed in. + artifacts.save(self.local_artifacts_path) + + # Upload artifacts to remote location and show where they are going + # to be loaded to. The actual loading happens in BoilerplateCode as + # part of the pre_code. 
+ remote_artifacts_path = code_interpreter.upload_file( + self.local_artifacts_path + ) + artifacts_loaded = artifacts.show() + int_chat.append({"role": "observation", "content": artifacts_loaded}) + orig_chat.append({"role": "observation", "content": artifacts_loaded}) + + while not finished and iterations < self.max_iterations: response = run_conversation(self.agent, int_chat) if self.verbosity >= 1: _LOGGER.info(response) @@ -230,13 +256,8 @@ def chat_with_code( if code_action is not None: obs = run_code_action( - code_action, code_interpreter, artifacts_remote_path + code_action, code_interpreter, str(remote_artifacts_path) ) - artifacts_local_path = code_interpreter.download_file( - artifacts_remote_path - ) - artifacts.load(artifacts_local_path) - artifacts.save() if self.verbosity >= 1: _LOGGER.info(obs) @@ -245,6 +266,13 @@ def chat_with_code( iterations += 1 last_response = response + + # after running the agent, download the artifacts locally + code_interpreter.download_file( + str(remote_artifacts_path.name), str(self.local_artifacts_path) + ) + artifacts.load(self.local_artifacts_path) + artifacts.save() return orig_chat def log_progress(self, data: Dict[str, Any]) -> None: From 84757f7b41b052a802628a5b7f17459859b1e757 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:02:44 -0700 Subject: [PATCH 14/37] rename prompts --- vision_agent/agent/vision_agent_prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index 7e98bd94..c1cf541e 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -1,4 +1,4 @@ -VA_CODE_ARTIFACTS = """ +VA_CODE = """ **Role**: You are a helpful conversational agent that assists users with their requests by writing code to solve it. **Taks**: As a conversational agent, you are required to understand the user's request and provide a helpful response. 
Use a Chain-of-Thought approach to break down the problem, create a plan, and then provide a response. Ensure that your response is clear, concise, and helpful. You can use an interactive Python (Jupyter Notebook) environment, executing code with . You are given access to an `artifacts` object which contains files shared between you and the user. `artifacts` will be automatically saved everytime you execute python code. @@ -29,7 +29,7 @@ """ -EXAMPLES_CODE1_ARTIFACTS = """ +EXAMPLES_CODE1 = """ USER: Can you detect the dogs in this image? Media name dog.jpg OBSERVATION: @@ -85,7 +85,7 @@ """ -EXAMPLES_CODE2_ARTIFACTS = """ +EXAMPLES_CODE2 = """ USER: Can you create a function to count workers with helmets? OBSERVATION: From 65c8cdb38e9d88d70795c0737df94005996ebdca Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:03:20 -0700 Subject: [PATCH 15/37] add docs for artifacts, allow None artifacts (which don't load) to be added --- vision_agent/tools/meta_tools.py | 35 ++++++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 8 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 4b245757..a25645c4 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -42,28 +42,47 @@ def filter_file(file_name: Union[str, Path]) -> bool: class Artifacts: - def __init__(self, save_path: Union[str, Path]) -> None: - self.save_path = Path(save_path) + """Artifacts is a class that allows you to sync files between a local and remote + environment. In our case, the remote environment could be where the VisionAgent is + executing code and as the user adds new images, files or modifies files, those + need to be in sync with the remote environment the VisionAgent is running in. 
+ """ + + def __init__(self, remote_save_path: Union[str, Path]) -> None: + self.remote_save_path = Path(remote_save_path) self.artifacts = {} self.code_sandbox_runtime = None def load(self, file_path: Union[str, Path]) -> None: + """Loads are artifacts into the remote environment. If an artifact value is None + it will skip loading it. + + Parameters: + file_path (Union[str, Path]): The file path to load the artifacts from + """ with open(file_path, "rb") as f: self.artifacts = pkl.load(f) for k, v in self.artifacts.items(): - with open(self.save_path.parent / k, "w") as f: - f.write(v) + if v is not None: + with open(self.remote_save_path.parent / k, "w") as f: + f.write(v) def show(self) -> str: + """Shows the artifacts that have been loaded and their remote save paths.""" out_str = "[Artifacts loaded]\n" for k in self.artifacts.keys(): - out_str += f"Artifact {k} loaded to {str(self.save_path.parent / k)}\n" + out_str += ( + f"Artifact {k} loaded to {str(self.remote_save_path.parent / k)}\n" + ) out_str += "[End of artifacts]\n" return out_str - def save(self) -> None: - with open(self.save_path, "wb") as f: + def save(self, local_path: Optional[Union[str, Path]] = None) -> None: + save_path = ( + Path(local_path) if local_path is not None else self.remote_save_path + ) + with open(save_path, "wb") as f: pkl.dump(self.artifacts, f) def __iter__(self): @@ -259,7 +278,7 @@ def detect_dogs(image_path: str): agent = va.agent.VisionAgentCoder() fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] - response = agent.chat_with_workflow(fixed_chat) + response = agent.chat_with_workflow(fixed_chat, test_multi_plan=False) code = response["code"] artifacts[name] = code code_lines = code.splitlines(keepends=True) From b3c13b1ecca2624e8209ad793e6c3b559e8ffd4a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:10:21 -0700 Subject: [PATCH 16/37] e2b and local upload/download work similarly now, can pass in target download path ---
vision_agent/utils/execute.py | 68 +++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 26 deletions(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 299f5d2c..ee671a15 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -407,17 +407,19 @@ def exec_isolation(self, code: str) -> Execution: self.restart_kernel() return self.exec_cell(code) - def upload_file(self, file: Union[str, Path]) -> str: + def upload_file(self, file: Union[str, Path]) -> Path: # Default behavior is a no-op (for local code interpreter) - return str(file) + return Path(file) - def download_file(self, file_path: str) -> Path: + def download_file(self, remote_file_path: str, local_file_path: str) -> Path: # Default behavior is a no-op (for local code interpreter) - return Path(file_path) + return Path(local_file_path) class E2BCodeInterpreter(CodeInterpreter): - def __init__(self, *args: Any, **kwargs: Any) -> None: + def __init__( + self, remote_path: Optional[Union[str, Path]] = None, *args: Any, **kwargs: Any + ) -> None: super().__init__(*args, **kwargs) assert os.getenv("E2B_API_KEY"), "E2B_API_KEY environment variable must be set" try: @@ -444,6 +446,9 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: _LOGGER.info( f"E2BCodeInterpreter (sandbox id: {self.interpreter.sandbox_id}) initialized:\n{sys_versions}" ) + self.remote_path = Path( + remote_path if remote_path is not None else "/home/user" + ) def close(self, *args: Any, **kwargs: Any) -> None: try: @@ -517,19 +522,18 @@ def exec_cell(self, code: str) -> Execution: before_sleep=tenacity.before_sleep_log(_LOGGER, logging.INFO), after=tenacity.after_log(_LOGGER, logging.INFO), ) - def upload_file(self, file: Union[str, Path]) -> str: + def upload_file(self, file: Union[str, Path]) -> Path: file_name = Path(file).name - remote_path = f"/home/user/{file_name}" with open(file, "rb") as f: - self.interpreter.files.write(path=remote_path, data=f) - 
_LOGGER.info(f"File ({file}) is uploaded to: {remote_path}") - return remote_path + self.interpreter.files.write(path=str(self.remote_path / file_name), data=f) + _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}") + return self.remote_path - def download_file(self, file_path: str) -> Path: - with tempfile.NamedTemporaryFile(mode="w+b", delete=False) as file: - file.write(self.interpreter.files.read(path=file_path, format="bytes")) - _LOGGER.info(f"File ({file_path}) is downloaded to: {file.name}") - return Path(file.name) + def download_file(self, remote_file_path: str, local_file_path: str) -> Path: + with open(local_file_path, "w+b") as f: + f.write(self.interpreter.files.read(path=remote_file_path, format="bytes")) + _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}") + return Path(local_file_path) @staticmethod def _new_e2b_interpreter_impl(*args, **kwargs) -> E2BCodeInterpreterImpl: # type: ignore @@ -541,7 +545,11 @@ def _new_e2b_interpreter_impl(*args, **kwargs) -> E2BCodeInterpreterImpl: # typ class LocalCodeInterpreter(CodeInterpreter): - def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None: + def __init__( + self, + timeout: int = _SESSION_TIMEOUT, + remote_path: Optional[Union[str, Path]] = None, + ) -> None: super().__init__(timeout=timeout) self.nb = nbformat.v4.new_notebook() self.nb_client = NotebookClient(self.nb, timeout=self.timeout) @@ -555,6 +563,7 @@ def __init__(self, timeout: int = _SESSION_TIMEOUT) -> None: ) sleep(1) self._new_kernel() + self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE) def _new_kernel(self) -> None: if self.nb_client.kc is None or not run_sync(self.nb_client.kc.is_alive)(): # type: ignore @@ -611,18 +620,19 @@ def exec_cell(self, code: str) -> Execution: def upload_file(self, file_path: str) -> Path: with open(file_path, "rb") as f: contents = f.read() - with open(WORKSPACE / file_path, "wb") as f: + with open(self.remote_path / 
Path(file_path).name, "wb") as f: f.write(contents) + _LOGGER.info(f"File ({file_path}) is uploaded to: {str(self.remote_path)}") - return Path(WORKSPACE / file_path) + return Path(self.remote_path / file_path) - def download_file(self, file_path: str) -> Path: - with open(WORKSPACE / file_path, "rb") as f: + def download_file(self, remote_file_path: str, local_file_path: str) -> Path: + with open(self.remote_path / remote_file_path, "rb") as f: contents = f.read() - with open(file_path, "wb") as f: + with open(local_file_path, "wb") as f: f.write(contents) - return Path(file_path) - + _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}") + return Path(local_file_path) class CodeInterpreterFactory: @@ -647,13 +657,19 @@ def get_default_instance() -> CodeInterpreter: return instance @staticmethod - def new_instance(code_sandbox_runtime: Optional[str] = None) -> CodeInterpreter: + def new_instance( + code_sandbox_runtime: Optional[str] = None, remote_path: Optional[str] = None + ) -> CodeInterpreter: if not code_sandbox_runtime: code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local") if code_sandbox_runtime == "e2b": - instance: CodeInterpreter = E2BCodeInterpreter(timeout=_SESSION_TIMEOUT) + instance: CodeInterpreter = E2BCodeInterpreter( + timeout=_SESSION_TIMEOUT, remote_path=remote_path + ) elif code_sandbox_runtime == "local": - instance = LocalCodeInterpreter(timeout=_SESSION_TIMEOUT) + instance = LocalCodeInterpreter( + timeout=_SESSION_TIMEOUT, remote_path=remote_path + ) else: raise ValueError( f"Unsupported code sandbox runtime: {code_sandbox_runtime}. 
Supported runtimes: e2b, local" From 6ebb75b84caa81253b75bff06bf7470bf5ffafc6 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:10:33 -0700 Subject: [PATCH 17/37] add Artifacts to exports --- vision_agent/tools/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index 3372fcbb..cbd92358 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -1,6 +1,6 @@ from typing import Callable, List, Optional -from .meta_tools import META_TOOL_DOCSTRING +from .meta_tools import META_TOOL_DOCSTRING, Artifacts from .prompts import CHOOSE_PARAMS, SYSTEM_PROMPT from .tool_utils import get_tool_descriptions_by_names from .tools import ( From 907c44970192d981f281576cf11fe1bd67985b0d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:10:45 -0700 Subject: [PATCH 18/37] local chat app to work with artifacts --- examples/chat/app.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/chat/app.py b/examples/chat/app.py index 94f3b23a..68dede8d 100644 --- a/examples/chat/app.py +++ b/examples/chat/app.py @@ -26,9 +26,14 @@ "response": "saved", "style": {"bottom": "calc(50% - 4.25rem", "right": "0.4rem"}, } -artifacts = va.tools.Artifacts("artifacts.pkl") -artifacts.save() -agent = va.agent.VisionAgent(verbosity=1) +# set artifacts remote_path to WORKSPACE +artifacts = va.tools.Artifacts(WORKSPACE / "artifacts.pkl") +if Path("artifacts.pkl").exists(): + artifacts.load("artifacts.pkl") +else: + artifacts.save("artifacts.pkl") + +agent = va.agent.VisionAgent(verbosity=1, local_artifacts_path="artifacts.pkl") st.set_page_config(layout="wide") @@ -125,7 +130,9 @@ def main(): if uploaded_file is not None: with open(WORKSPACE / uploaded_file.name, "wb") as f: f.write(uploaded_file.getbuffer()) - artifacts.artifacts[WORKSPACE / uploaded_file.name] = "" + + # make it None so it wont load and overwrite 
the image + artifacts.artifacts[uploaded_file.name] = None for file in WORKSPACE.iterdir(): if "__pycache__" not in str(file) and not str(file).startswith("."): From bbae983d856f34acc6fb6abc400c1e9ef4aca45b Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:16:33 -0700 Subject: [PATCH 19/37] updated docs --- README.md | 18 +++++++++++------- docs/index.md | 21 +++++++++++++-------- vision_agent/utils/image_utils.py | 2 +- 3 files changed, 25 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index f41bef31..88c59973 100644 --- a/README.md +++ b/README.md @@ -41,15 +41,15 @@ export OPENAI_API_KEY="your-api-key" ``` ### Vision Agent -There are two agents that you can use. Vision Agent is a conversational agent that has +There are two agents that you can use. `VisionAgent` is a conversational agent that has access to tools that allow it to write an navigate python code and file systems. It can -converse with the user in natural language. VisionAgentCoder is an agent that can write -code for vision tasks, such as counting people in an image. However, it cannot converse -and can only respond with code. VisionAgent can call VisionAgentCoder to write vision -code. +converse with the user in natural language. `VisionAgentCoder` is an agent specifically +for writing code for vision tasks, such as counting people in an image. However, it +cannot chat with you and can only respond with code. `VisionAgent` can call +`VisionAgentCoder` to write vision code. #### Basic Usage -To run the streamlit app locally to chat with Vision Agent, you can run the following +To run the streamlit app locally to chat with `VisionAgent`, you can run the following command: ```bash @@ -146,7 +146,7 @@ the code and having it update. You just need to add the code as a response from assistant: ```python -agent = va.agent.VisionAgent(verbosity=2) +agent = va.agent.VisionAgentCoder(verbosity=2) conv = [ { "role": "user", @@ -212,6 +212,10 @@ function. 
Make sure the documentation is in the same format above with descripti `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool. +Can't find the tool you need and want add it to `VisionAgent`? Check out our +[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where +we add the source code for all the tools used in `VisionAgent`. + ## Additional Backends ### Ollama We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download diff --git a/docs/index.md b/docs/index.md index 8569c5cc..0f5022f9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -38,15 +38,15 @@ export OPENAI_API_KEY="your-api-key" ``` ### Vision Agent -There are two agents that you can use. Vision Agent is a conversational agent that has +There are two agents that you can use. `VisionAgent` is a conversational agent that has access to tools that allow it to write an navigate python code and file systems. It can -converse with the user in natural language. VisionAgentCoder is an agent that can write -code for vision tasks, such as counting people in an image. However, it cannot converse -and can only respond with code. VisionAgent can call VisionAgentCoder to write vision -code. +converse with the user in natural language. `VisionAgentCoder` is an agent specifically +for writing code for vision tasks, such as counting people in an image. However, it +cannot chat with you and can only respond with code. `VisionAgent` can call +`VisionAgentCoder` to write vision code. #### Basic Usage -To run the streamlit app locally to chat with Vision Agent, you can run the following +To run the streamlit app locally to chat with `VisionAgent`, you can run the following command: ```bash @@ -143,7 +143,7 @@ the code and having it update. 
You just need to add the code as a response from assistant: ```python -agent = va.agent.VisionAgent(verbosity=2) +agent = va.agent.VisionAgentCoder(verbosity=2) conv = [ { "role": "user", @@ -209,6 +209,10 @@ function. Make sure the documentation is in the same format above with descripti `Parameters:`, `Returns:`, and `Example\n-------`. You can find an example use case [here](examples/custom_tools/) as this is what the agent uses to pick and use the tool. +Can't find the tool you need and want add it to `VisionAgent`? Check out our +[vision-agent-tools](https://github.com/landing-ai/vision-agent-tools) repository where +we add the source code for all the tools used in `VisionAgent`. + ## Additional Backends ### Ollama We also provide a `VisionAgentCoder` that uses Ollama. To get started you must download @@ -230,6 +234,7 @@ tools. You can use it just like you would use `VisionAgentCoder`: >>> agent = va.agent.OllamaVisionAgentCoder() >>> agent("Count the apples in the image", media="apples.jpg") ``` +> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B. ### Azure OpenAI We also provide a `AzureVisionAgentCoder` that uses Azure OpenAI models. To get started @@ -241,7 +246,7 @@ follow the Azure Setup section below. You can use it just like you would use= >>> agent = va.agent.AzureVisionAgentCoder() >>> agent("Count the apples in the image", media="apples.jpg") ``` -> WARNING: VisionAgent doesn't work well unless the underlying LMM is sufficiently powerful. Do not expect good results or even working code with smaller models like Llama 3.1 8B. 
+ ### Azure Setup If you want to use Azure OpenAI models, you need to have two OpenAI model deployments: diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index d2bc8a6d..54688f93 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -70,7 +70,7 @@ def rle_decode_array(rle: Dict[str, List[int]]) -> np.ndarray: r"""Decode a run-length encoded mask. Returns numpy array, 1 - mask, 0 - background. Parameters: - mask: The mask in run-length encoded as an array. + rle: The run-length encoded mask. """ size = rle["size"] counts = rle["counts"] From 3e7cfd2552f3c56ee5745631eac81c47bf7c17e0 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 13:24:33 -0700 Subject: [PATCH 20/37] fix flake8 --- vision_agent/tools/meta_tools.py | 358 +------------------------------ vision_agent/utils/execute.py | 1 - 2 files changed, 1 insertion(+), 358 deletions(-) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index a25645c4..bc3e3058 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -3,16 +3,12 @@ import subprocess import tempfile from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union -from uuid import UUID +from typing import Any, Dict, List, Optional, Union import vision_agent as va -from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.lmm.types import Message from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS -from vision_agent.utils import CodeInterpreterFactory -from vision_agent.utils.image_utils import convert_to_b64 # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -339,349 +335,6 @@ def detect_dogs(image_path: str): return view_lines(code_lines, 0, total_lines, name, total_lines) -# def generate_vision_code(save_file: str, chat: str, media: List[str]) -> 
str: -# """Generates python code to solve vision based tasks. - -# Parameters: -# save_file (str): The file path to save the code. -# chat (str): The chat message from the user. -# media (List[str]): The media files to use. - -# Returns: -# str: The generated code. - -# Examples -# -------- -# >>> generate_vision_code("code.py", "Can you detect the dogs in this image?", ["image.jpg"]) -# from vision_agent.tools import load_image, owl_v2 -# def detect_dogs(image_path: str): -# image = load_image(image_path) -# dogs = owl_v2("dog", image) -# return dogs -# """ - -# if ZMQ_PORT is not None: -# agent = va.agent.VisionAgentCoder( -# report_progress_callback=lambda inp: report_progress_callback( -# int(ZMQ_PORT), inp -# ) -# ) -# else: -# agent = va.agent.VisionAgentCoder() -# try: -# fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] -# response = agent.chat_with_workflow(fixed_chat) -# code = response["code"] -# with open(save_file, "w") as f: -# f.write(code) -# code_lines = code.splitlines(keepends=True) -# total_lines = len(code_lines) -# return view_lines(code_lines, 0, total_lines, save_file, total_lines) -# except Exception as e: -# return str(e) - - -# def edit_vision_code(code_file: str, chat_history: List[str], media: List[str]) -> str: -# """Edits python code to solve a vision based task. - -# Parameters: -# code_file (str): The file path to the code. -# chat_history (List[str]): The chat history to used to generate the code. - -# Returns: -# str: The edited code. 
- -# Examples -# -------- -# >>> edit_vision_code( -# >>> "code.py", -# >>> ["Can you detect the dogs in this image?", "Can you use a higher threshold?"], -# >>> ["dog.jpg"], -# >>> ) -# from vision_agent.tools import load_image, owl_v2 -# def detect_dogs(image_path: str): -# image = load_image(image_path) -# dogs = owl_v2("dog", image, threshold=0.8) -# return dogs -# """ - -# agent = va.agent.VisionAgentCoder() -# with open(code_file, "r") as f: -# code = f.read() - -# # Append latest code to second to last message from assistant -# fixed_chat_history: List[Message] = [] -# for i, chat in enumerate(chat_history): -# if i == 0: -# fixed_chat_history.append({"role": "user", "content": chat, "media": media}) -# elif i > 0 and i < len(chat_history) - 1: -# fixed_chat_history.append({"role": "user", "content": chat}) -# elif i == len(chat_history) - 1: -# fixed_chat_history.append({"role": "assistant", "content": code}) -# fixed_chat_history.append({"role": "user", "content": chat}) - -# try: -# response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) -# code = response["code"] -# with open(code_file, "w") as f: -# f.write(code) -# code_lines = code.splitlines(keepends=True) -# total_lines = len(code_lines) -# return view_lines(code_lines, 0, total_lines, code_file, total_lines) -# except Exception as e: -# return str(e) - - -# def format_lines(lines: List[str], start_idx: int) -> str: -# output = "" -# for i, line in enumerate(lines): -# output += f"{i + start_idx}|{line}" -# return output - - -# def view_lines( -# lines: List[str], line_num: int, window_size: int, file_path: str, total_lines: int -# ) -> str: -# start = max(0, line_num - window_size) -# end = min(len(lines), line_num + window_size) -# return ( -# f"[File: {file_path} ({total_lines} lines total)]\n" -# + format_lines(lines[start:end], start) -# + ("[End of file]" if end == len(lines) else f"[{len(lines) - end} more lines]") -# ) - - -# def open_file(file_path: str, line_num: int 
= 0, window_size: int = 100) -> str: -# """Opens the file at at the given path in the editor. If `line_num` is provided, -# the window will be moved to include that line. It only shows the first 100 lines by -# default! Max `window_size` supported is 2000. use `scroll up/down` to view the file -# if you want to see more. - -# Parameters: -# file_path (str): The file path to open, preferred absolute path. -# line_num (int): The line number to move the window to. -# window_size (int): The number of lines to show above and below the line. -# """ - -# file_path_p = Path(file_path) -# if not file_path_p.exists(): -# return f"[File {file_path} does not exist]" - -# total_lines = sum(1 for _ in open(file_path_p)) -# window_size = min(window_size, 2000) -# window_size = window_size // 2 -# if line_num - window_size < 0: -# line_num = window_size -# elif line_num >= total_lines: -# line_num = total_lines - 1 - window_size - -# global CURRENT_LINE, CURRENT_FILE -# CURRENT_LINE = line_num -# CURRENT_FILE = file_path - -# with open(file_path, "r") as f: -# lines = f.readlines() - -# return view_lines(lines, line_num, window_size, file_path, total_lines) - - -# def create_file(file_path: str) -> str: -# """Creates and opens a new file with the given name. - -# Parameters: -# file_path (str): The file path to create, preferred absolute path. 
-# """ - -# file_path_p = Path(file_path) -# if file_path_p.exists(): -# return f"[File {file_path} already exists]" -# file_path_p.touch() -# global CURRENT_FILE -# CURRENT_FILE = file_path -# return f"[File created {file_path}]" - - -# def scroll_up() -> str: -# """Moves the window up by 100 lines.""" -# if CURRENT_FILE is None: -# return "[No file is open]" - -# return open_file(CURRENT_FILE, CURRENT_LINE + DEFAULT_WINDOW_SIZE) - - -# def scroll_down() -> str: -# """Moves the window down by 100 lines.""" -# if CURRENT_FILE is None: -# return "[No file is open]" - -# return open_file(CURRENT_FILE, CURRENT_LINE - DEFAULT_WINDOW_SIZE) - - -# def search_dir(search_term: str, dir_path: str) -> str: -# """Searches for search_term in all files in a directory. - -# Parameters: -# search_term (str): The search term to look for. -# dir_path (str): The directory path to search in, preferred absolute path. -# """ - -# dir_path_p = Path(dir_path) -# if not dir_path_p.exists(): -# return f"[Directory {dir_path} does not exist]" - -# matches = [] -# for file in dir_path_p.glob("**/*"): -# if filter_file(file): -# with open(file, "r") as f: -# lines = f.readlines() -# for i, line in enumerate(lines): -# if search_term in line: -# matches.append(f"{file}:{i}|{line.strip()}\n") -# if not matches: -# return f"[No matches found for {search_term} in {dir_path}]" -# if len(matches) > 100: -# return f"[More than {len(matches)} matches found for {search_term} in {dir_path}. Please narrow your search]" - -# return_str = f"[Found {len(matches)} matches for {search_term} in {dir_path}]\n" -# for match in matches: -# return_str += match - -# return_str += f"[End of matches for {search_term} in {dir_path}]" -# return return_str - - -# def search_file(search_term: str, file_path: str) -> str: -# """Searches the file for the given search term. - -# Parameters: -# search_term (str): The search term to look for. -# file_path (str): The file path to search in, preferred absolute path. 
-# """ - -# file_path_p = Path(file_path) -# if not file_path_p.exists(): -# return f"[File {file_path} does not exist]" - -# with open(file_path_p, "r") as f: -# lines = f.readlines() - -# search_results = [] -# for i, line in enumerate(lines): -# if search_term in line: -# search_results.append(f"{i}|{line.strip()}\n") - -# if not search_results: -# return f"[No matches found for {search_term} in {file_path}]" - -# return_str = ( -# f"[Found {len(search_results)} matches for {search_term} in {file_path}]\n" -# ) -# for result in search_results: -# return_str += result - -# return_str += f"[End of matches for {search_term} in {file_path}]" -# return return_str - - -# def find_file(file_name: str, dir_path: str = "./") -> str: -# """Finds all files with the given name in the specified directory. - -# Parameters: -# file_name (str): The file name to look for. -# dir_path (str): The directory path to search in, preferred absolute path. -# """ - -# dir_path_p = Path(dir_path) -# if not dir_path_p.exists(): -# return f"[Directory {dir_path} does not exist]" - -# files = list(dir_path_p.glob(f"**/*{file_name}*")) -# files = [f for f in files if filter_file(f)] -# if not files: -# return f"[No files found in {dir_path} with name {file_name}]" - -# return_str = f"[Found {len(files)} matches for {file_name} in {dir_path}]\n" -# for match in files: -# return_str += str(match) + "\n" - -# return_str += f"[End of matches for {file_name} in {dir_path}]" -# return return_str - - -# def edit_file(file_path: str, start: int, end: int, content: str) -> str: -# """Edits the file at the given path with the provided content. The content will be -# inserted between the `start` and `end` line numbers. If the `start` and `end` are -# the same, the content will be inserted at the `start` line number. If the `end` is -# greater than the total number of lines in the file, the content will be inserted at -# the end of the file. 
If the `start` or `end` are negative, the function will return -# an error message. - -# Parameters: -# file_path (str): The file path to edit, preferred absolute path. -# start (int): The line number to start the edit. -# end (int): The line number to end the edit. -# content (str): The content to insert. -# """ -# file_path_p = Path(file_path) -# if not file_path_p.exists(): -# return f"[File {file_path} does not exist]" - -# total_lines = sum(1 for _ in open(file_path_p)) -# if start < 0 or end < 0 or start > end or end > total_lines: -# return "[Invalid line range]" -# if start == end: -# end += 1 - -# new_content_lines = content.splitlines(keepends=True) -# new_content_lines = [ -# line if line.endswith("\n") else line + "\n" for line in new_content_lines -# ] -# with open(file_path_p, "r") as f: -# lines = f.readlines() -# edited_lines = lines[:start] + new_content_lines + lines[end:] - -# cur_line = start + len(content.split("\n")) // 2 -# tmp_file = file_path_p.with_suffix(".tmp") -# with open(tmp_file, "w") as f: -# f.writelines(edited_lines) - -# process = subprocess.Popen( -# [ -# "flake8", -# "--isolated", -# "--select=F821,F822,F831,E111,E112,E113,E999,E902", -# tmp_file, -# ], -# stdout=subprocess.PIPE, -# stderr=subprocess.PIPE, -# text=True, -# ) -# stdout, _ = process.communicate() -# tmp_file.unlink() -# if stdout != "": -# stdout = stdout.replace(tmp_file.name, file_path) -# error_msg = "[Edit failed with the following status]\n" + stdout -# original_view = view_lines( -# lines, -# start + ((end - start) // 2), -# DEFAULT_WINDOW_SIZE, -# file_path, -# total_lines, -# ) -# total_lines_edit = sum(1 for _ in edited_lines) -# edited_view = view_lines( -# edited_lines, cur_line, DEFAULT_WINDOW_SIZE, file_path, total_lines_edit -# ) - -# error_msg += f"\n[This is how your edit would have looked like if applied]\n{edited_view}\n\n[This is the original code before your edit]\n{original_view}" -# return error_msg - -# with open(file_path_p, "w") as f: -# 
f.writelines(edited_lines) - -# return open_file(file_path, cur_line) - - def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. Helpful for answering questions about what types of vision tasks you can do with @@ -697,14 +350,5 @@ def get_tool_descriptions() -> str: edit_artifact, generate_vision_code, edit_vision_code, - # open_file, - # create_file, - # scroll_up, - # scroll_down, - # edit_file, - # search_dir, - # search_file, - # find_file, - # florencev2_fine_tuning, ] ) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index ee671a15..08924875 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -5,7 +5,6 @@ import platform import re import sys -import tempfile import traceback import warnings from enum import Enum From afc87c06e460702370e489d5f0cfd0d6059ab13c Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 14:21:34 -0700 Subject: [PATCH 21/37] fix mypy errors --- vision_agent/agent/agent.py | 2 +- vision_agent/agent/vision_agent.py | 15 +++++++++------ vision_agent/agent/vision_agent_coder.py | 2 +- vision_agent/tools/meta_tools.py | 8 ++++---- vision_agent/utils/execute.py | 10 +++++----- 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/vision_agent/agent/agent.py b/vision_agent/agent/agent.py index 6b11f297..ca2cf181 100644 --- a/vision_agent/agent/agent.py +++ b/vision_agent/agent/agent.py @@ -11,7 +11,7 @@ def __call__( self, input: Union[str, List[Message]], media: Optional[Union[str, Path]] = None, - ) -> str: + ) -> Union[str, List[Message]]: pass @abstractmethod diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 17fe347d..6399016e 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -38,7 +38,7 @@ class BoilerplateCode: ] @staticmethod - def add_boilerplate(code: str, **format) -> str: + def add_boilerplate(code: str, 
**format: Any) -> str: """Run this method to prepend the default imports to the code. NOTE: be sure to run this method after the custom tools have been registered. """ @@ -131,10 +131,13 @@ def __init__( self.code_sandbox_runtime = code_sandbox_runtime if self.verbosity >= 1: _LOGGER.setLevel(logging.INFO) - self.local_artifacts_path = ( - Path(local_artifacts_path) - if local_artifacts_path is not None - else "artifacts.pkl" + self.local_artifacts_path = cast( + str, + ( + Path(local_artifacts_path) + if local_artifacts_path is not None + else "artifacts.pkl" + ), ) def __call__( @@ -160,7 +163,7 @@ def __call__( if media is not None: input[0]["media"] = [media] results = self.chat_with_code(input, artifacts) - return results # type: ignore + return results def chat_with_code( self, diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 7856bdb8..cc0711b6 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -725,7 +725,7 @@ def chat_with_workflow( else code_interpreter.upload_file(media) ) chat_i["content"] += f" Media name {media}" # type: ignore - media_list.append(media) + media_list.append(str(media)) int_chat = cast( List[Message], diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index bc3e3058..89c2dbdd 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -46,7 +46,7 @@ class Artifacts: def __init__(self, remote_save_path: Union[str, Path]) -> None: self.remote_save_path = Path(remote_save_path) - self.artifacts = {} + self.artifacts: Dict[str, Any] = {} self.code_sandbox_runtime = None @@ -81,10 +81,10 @@ def save(self, local_path: Optional[Union[str, Path]] = None) -> None: with open(save_path, "wb") as f: pkl.dump(self.artifacts, f) - def __iter__(self): + def __iter__(self) -> Any: return iter(self.artifacts) - def __getitem__(self, name: str) -> str: + def __getitem__(self, name: str) -> 
Any: return self.artifacts[name] def __setitem__(self, name: str, value: str) -> None: @@ -201,7 +201,7 @@ def edit_artifact( cur_line = start + len(content.split("\n")) // 2 with tempfile.NamedTemporaryFile(delete=True) as f: - with open(f.name, "w") as f: + with open(f.name, "w") as f: # type: ignore f.writelines(edited_lines) process = subprocess.Popen( diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 08924875..15e4f9b9 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -410,7 +410,7 @@ def upload_file(self, file: Union[str, Path]) -> Path: # Default behavior is a no-op (for local code interpreter) return Path(file) - def download_file(self, remote_file_path: str, local_file_path: str) -> Path: + def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path: # Default behavior is a no-op (for local code interpreter) return Path(local_file_path) @@ -528,9 +528,9 @@ def upload_file(self, file: Union[str, Path]) -> Path: _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}") return self.remote_path - def download_file(self, remote_file_path: str, local_file_path: str) -> Path: + def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path: with open(local_file_path, "w+b") as f: - f.write(self.interpreter.files.read(path=remote_file_path, format="bytes")) + f.write(self.interpreter.files.read(path=str(remote_file_path), format="bytes")) _LOGGER.info(f"File ({remote_file_path}) is downloaded to: {local_file_path}") return Path(local_file_path) @@ -616,7 +616,7 @@ def exec_cell(self, code: str) -> Execution: traceback_raw = traceback.format_exc().splitlines() return Execution.from_exception(e, traceback_raw) - def upload_file(self, file_path: str) -> Path: + def upload_file(self, file_path: Union[str, Path]) -> Path: with open(file_path, "rb") as f: contents = f.read() with open(self.remote_path / 
Path(file_path).name, "wb") as f: @@ -625,7 +625,7 @@ def upload_file(self, file_path: str) -> Path: return Path(self.remote_path / file_path) - def download_file(self, remote_file_path: str, local_file_path: str) -> Path: + def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path: with open(self.remote_path / remote_file_path, "rb") as f: contents = f.read() with open(local_file_path, "wb") as f: From 4aa9fec019a77ad1122a1a805c6f54606607eba2 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 15:15:19 -0700 Subject: [PATCH 22/37] fix format --- vision_agent/utils/execute.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 15e4f9b9..05b03612 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -410,7 +410,9 @@ def upload_file(self, file: Union[str, Path]) -> Path: # Default behavior is a no-op (for local code interpreter) return Path(file) - def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path: + def download_file( + self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path] + ) -> Path: # Default behavior is a no-op (for local code interpreter) return Path(local_file_path) @@ -528,9 +530,13 @@ def upload_file(self, file: Union[str, Path]) -> Path: _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}") return self.remote_path - def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path: + def download_file( + self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path] + ) -> Path: with open(local_file_path, "w+b") as f: - f.write(self.interpreter.files.read(path=str(remote_file_path), format="bytes")) + f.write( + self.interpreter.files.read(path=str(remote_file_path), format="bytes") + ) _LOGGER.info(f"File ({remote_file_path}) is 
downloaded to: {local_file_path}") return Path(local_file_path) @@ -625,7 +631,9 @@ def upload_file(self, file_path: Union[str, Path]) -> Path: return Path(self.remote_path / file_path) - def download_file(self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path]) -> Path: + def download_file( + self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path] + ) -> Path: with open(self.remote_path / remote_file_path, "rb") as f: contents = f.read() with open(local_file_path, "wb") as f: From 53dea5728be340dba0f12f2b8539ffeb914dce4a Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Wed, 28 Aug 2024 20:28:50 -0700 Subject: [PATCH 23/37] add execution to conversation --- vision_agent/agent/vision_agent.py | 14 +++++++------- vision_agent/lmm/types.py | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 6399016e..04cafd5e 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -15,7 +15,7 @@ from vision_agent.tools import META_TOOL_DOCSTRING from vision_agent.tools.meta_tools import Artifacts from vision_agent.utils import CodeInterpreterFactory -from vision_agent.utils.execute import CodeInterpreter +from vision_agent.utils.execute import CodeInterpreter, Execution logging.basicConfig(level=logging.INFO) _LOGGER = logging.getLogger(__name__) @@ -75,11 +75,10 @@ def run_conversation(orch: LMM, chat: List[Message]) -> Dict[str, Any]: def run_code_action( code: str, code_interpreter: CodeInterpreter, artifact_remote_path: str -) -> str: - result = code_interpreter.exec_cell( +) -> Execution: + return code_interpreter.exec_isolation( BoilerplateCode.add_boilerplate(code, remote_path=artifact_remote_path) ) - return result.text() def parse_execution(response: str) -> Optional[str]: @@ -258,14 +257,15 @@ def chat_with_code( code_action = parse_execution(response["response"]) if code_action is not None: - obs = 
run_code_action( + result = run_code_action( code_action, code_interpreter, str(remote_artifacts_path) ) + obs = result.text() if self.verbosity >= 1: _LOGGER.info(obs) - int_chat.append({"role": "observation", "content": obs}) - orig_chat.append({"role": "observation", "content": obs}) + int_chat.append({"role": "observation", "content": obs, "execution": result}) + orig_chat.append({"role": "observation", "content": obs, "execution": result}) iterations += 1 last_response = response diff --git a/vision_agent/lmm/types.py b/vision_agent/lmm/types.py index ded6a42b..ba2b3189 100644 --- a/vision_agent/lmm/types.py +++ b/vision_agent/lmm/types.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import Dict, Sequence, Union +from vision_agent.utils.execute import Execution TextOrImage = Union[str, Sequence[Union[str, Path]]] -Message = Dict[str, TextOrImage] +Message = Dict[str, Union[TextOrImage, Execution]] From e508809658458e309d68c441f3842a48df050b6e Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 08:32:32 -0700 Subject: [PATCH 24/37] fixed type errors --- vision_agent/agent/vision_agent.py | 10 +++++++--- vision_agent/agent/vision_agent_coder.py | 2 +- vision_agent/lmm/lmm.py | 6 ++++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 04cafd5e..58bea5ca 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -198,7 +198,7 @@ def chat_with_code( for chat_i in int_chat: if "media" in chat_i: for media in chat_i["media"]: - media = code_interpreter.upload_file(media) + media = code_interpreter.upload_file(cast(str, media)) chat_i["content"] += f" Media name {media}" # type: ignore # Save dummy value for now since we just need to know the path # name in the key 'media'. 
Later on we can add artifact support @@ -264,8 +264,12 @@ def chat_with_code( if self.verbosity >= 1: _LOGGER.info(obs) - int_chat.append({"role": "observation", "content": obs, "execution": result}) - orig_chat.append({"role": "observation", "content": obs, "execution": result}) + int_chat.append( + {"role": "observation", "content": obs, "execution": result} + ) + orig_chat.append( + {"role": "observation", "content": obs, "execution": result} + ) iterations += 1 last_response = response diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index cc0711b6..c8488902 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -722,7 +722,7 @@ def chat_with_workflow( media if type(media) is str and media.startswith(("http", "https")) - else code_interpreter.upload_file(media) + else code_interpreter.upload_file(cast(str, media)) ) chat_i["content"] += f" Media name {media}" # type: ignore media_list.append(str(media)) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 15df5ac9..76481f3f 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -138,7 +138,7 @@ def chat( fixed_c["content"] = [{"type": "text", "text": c["content"]}] # type: ignore if "media" in c: for media in c["media"]: - encoded_media = encode_media(media) + encoded_media = encode_media(cast(str, media)) fixed_c["content"].append( # type: ignore { @@ -389,7 +389,9 @@ def chat( fixed_chat = [] for message in chat: if "media" in message: - message["images"] = [encode_media(m) for m in message["media"]] + message["images"] = [ + encode_media(cast(str, m)) for m in message["media"] + ] del message["media"] fixed_chat.append(message) url = f"{self.url}/chat" From d83857e8f430e0458eff13b3eb824a303b502bbc Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 09:47:26 -0700 Subject: [PATCH 25/37] fixed bug with upload file --- vision_agent/utils/execute.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 05b03612..b2da6f11 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -528,7 +528,7 @@ def upload_file(self, file: Union[str, Path]) -> Path: with open(file, "rb") as f: self.interpreter.files.write(path=str(self.remote_path / file_name), data=f) _LOGGER.info(f"File ({file}) is uploaded to: {str(self.remote_path)}") - return self.remote_path + return self.remote_path / file_name def download_file( self, remote_file_path: Union[str, Path], local_file_path: Union[str, Path] From 51503b90febc9d6d85b277b2cd2d757d6f30210f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 10:15:32 -0700 Subject: [PATCH 26/37] added ability to write media files to artifacts --- vision_agent/agent/vision_agent.py | 17 +++++----- vision_agent/tools/meta_tools.py | 53 +++++++++++++++++++----------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 58bea5ca..df73cef4 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -29,7 +29,7 @@ class BoilerplateCode: pre_code = [ "from typing import *", "from vision_agent.utils.execute import CodeInterpreter", - "from vision_agent.tools.meta_tools import Artifacts, open_artifact, create_artifact, edit_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code", + "from vision_agent.tools.meta_tools import Artifacts, open_code_artifact, create_code_artifact, edit_code_artifact, get_tool_descriptions, generate_vision_code, edit_vision_code, write_media_artifact", "artifacts = Artifacts('{remote_path}')", "artifacts.load('{remote_path}')", ] @@ -198,13 +198,14 @@ def chat_with_code( for chat_i in int_chat: if "media" in chat_i: for media in chat_i["media"]: - media = code_interpreter.upload_file(cast(str, media)) - chat_i["content"] += f" Media name {media}" 
# type: ignore - # Save dummy value for now since we just need to know the path - # name in the key 'media'. Later on we can add artifact support - # for byte data. - artifacts.artifacts[Path(media).name] = None - media_list.append(media) + media = cast(str, media) + artifacts.artifacts[Path(media).name] = open(media, "rb").read() + + media_remote_path = ( + Path(code_interpreter.remote_path) / Path(media).name + ) + chat_i["content"] += f" Media name {media_remote_path}" # type: ignore + media_list.append(media_remote_path) int_chat = cast( List[Message], diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 89c2dbdd..364bdf0e 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -61,7 +61,8 @@ def load(self, file_path: Union[str, Path]) -> None: self.artifacts = pkl.load(f) for k, v in self.artifacts.items(): if v is not None: - with open(self.remote_save_path.parent / k, "w") as f: + mode = "w" if isinstance(v, str) else "wb" + with open(self.remote_save_path.parent / k, mode) as f: f.write(v) def show(self) -> str: @@ -87,7 +88,7 @@ def __iter__(self) -> Any: def __getitem__(self, name: str) -> Any: return self.artifacts[name] - def __setitem__(self, name: str, value: str) -> None: + def __setitem__(self, name: str, value: Any) -> None: self.artifacts[name] = value def __contains__(self, name: str) -> bool: @@ -119,11 +120,11 @@ def view_lines( return return_str -def open_artifact( +def open_code_artifact( artifacts: Artifacts, name: str, line_num: int = 0, window_size: int = 100 ) -> str: - """Opens the provided artifact. If `line_num` is provided, the window will be moved - to include that line. It only shows the first 100 lines by default! Max + """Opens the provided code artifact. If `line_num` is provided, the window will be + moved to include that line. It only shows the first 100 lines by default! Max `window_size` supported is 2000. 
Parameters: @@ -148,8 +149,8 @@ def open_artifact( return view_lines(lines, line_num, window_size, name, total_lines) -def create_artifact(artifacts: Artifacts, name: str) -> str: - """Creates a new artifiact with the given name. +def create_code_artifact(artifacts: Artifacts, name: str) -> str: + """Creates a new code artifact with the given name. Parameters: artifacts (Artifacts): The artifacts object to add the new artifact to. @@ -164,15 +165,15 @@ def create_artifact(artifacts: Artifacts, name: str) -> str: return return_str -def edit_artifact( +def edit_code_artifact( artifacts: Artifacts, name: str, start: int, end: int, content: str ) -> str: - """Edits the given artifact with the provided content. The content will be inserted - between the `start` and `end` line numbers. If the `start` and `end` are the same, - the content will be inserted at the `start` line number. If the `end` is greater - than the total number of lines in the file, the content will be inserted at the end - of the file. If the `start` or `end` are negative, the function will return an - error message. + """Edits the given code artifact with the provided content. The content will be + inserted between the `start` and `end` line numbers. If the `start` and `end` are + the same, the content will be inserted at the `start` line number. If the `end` is + greater than the total number of lines in the file, the content will be inserted at + the end of the file. If the `start` or `end` are negative, the function will return + an error message. Parameters: artifacts (Artifacts): The artifacts object to edit the artifact from. 
@@ -237,7 +238,7 @@ def edit_artifact( artifacts[name] = "".join(edited_lines) - return open_artifact(artifacts, name, cur_line) + return open_code_artifact(artifacts, name, cur_line) def generate_vision_code( @@ -274,7 +275,7 @@ def detect_dogs(image_path: str): agent = va.agent.VisionAgentCoder() fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] - response = agent.chat_with_workflow(fixed_chat, test_multi_plan=False) + response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True) code = response["code"] artifacts[name] = code code_lines = code.splitlines(keepends=True) @@ -335,6 +336,19 @@ def detect_dogs(image_path: str): return view_lines(code_lines, 0, total_lines, name, total_lines) +def write_media_artifact(artifacts: Artifacts, local_path: str) -> str: + """Writes a media file to the artifacts object. + + Parameters: + artifacts (Artifacts): The artifacts object to save the media to. + local_path (str): The local path to the media file. + """ + with open(local_path, "rb") as f: + media = f.read() + artifacts[Path(local_path).name] = media + return f"[Media {Path(local_path).name} saved]" + + def get_tool_descriptions() -> str: """Returns a description of all the tools that `generate_vision_code` has access to. 
Helpful for answering questions about what types of vision tasks you can do with @@ -345,10 +359,11 @@ def get_tool_descriptions() -> str: META_TOOL_DOCSTRING = get_tool_documentation( [ get_tool_descriptions, - open_artifact, - create_artifact, - edit_artifact, + open_code_artifact, + create_code_artifact, + edit_code_artifact, generate_vision_code, edit_vision_code, + write_media_artifact, ] ) From 0ed6bb7bfc22f9167b2eb991ddda0725b3cf7a8f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 10:15:58 -0700 Subject: [PATCH 27/37] return outside of context --- vision_agent/tools/tools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 62a1908a..0695b547 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,8 +1,9 @@ -import os import io import json import logging +import os import tempfile +import urllib.request from importlib import resources from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast @@ -15,7 +16,6 @@ from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore from pytube import YouTube # type: ignore -import urllib.request from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.tools.tool_utils import ( @@ -1332,7 +1332,7 @@ def save_video( video.write_videofile(f.name, codec="libx264") f.close() _save_video_to_result(f.name) - return f.name + return f.name def _save_video_to_result(video_uri: str) -> None: From 04bd7686d752baea5293cd6c72d0d1d2cb260244 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 10:17:03 -0700 Subject: [PATCH 28/37] make remote path execute variable --- vision_agent/utils/execute.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index b2da6f11..37c8d260 100644 --- 
a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -384,8 +384,15 @@ def from_e2b_execution(exec: E2BExecution) -> "Execution": class CodeInterpreter(abc.ABC): """Code interpreter interface.""" - def __init__(self, timeout: int, *args: Any, **kwargs: Any) -> None: + def __init__( + self, + timeout: int, + remote_path: Optional[Union[str, Path]] = None, + *args: Any, + **kwargs: Any, + ) -> None: self.timeout = timeout + self.remote_path = Path(remote_path if remote_path is not None else WORKSPACE) def __enter__(self) -> Self: return self From 9782893dc7c5624eb297e6115eec12f828392905 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 10:19:24 -0700 Subject: [PATCH 29/37] add codec for video encoding --- vision_agent/utils/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index 54688f93..979f6c97 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -100,7 +100,7 @@ def frames_to_bytes( """ with tempfile.NamedTemporaryFile(delete=True) as temp_file: clip = ImageSequenceClip(frames, fps=fps) - clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps) + clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, code="libx264") with open(temp_file.name + f".{file_ext}", "rb") as f: buffer_bytes = f.read() return buffer_bytes From 75c12893d90d82e680230fa1d0f501e350dbafcf Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 10:19:52 -0700 Subject: [PATCH 30/37] fix prompts to include writing media artifacts --- vision_agent/agent/vision_agent_prompts.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/vision_agent/agent/vision_agent_prompts.py b/vision_agent/agent/vision_agent_prompts.py index c1cf541e..85e34cd5 100644 --- a/vision_agent/agent/vision_agent_prompts.py +++ b/vision_agent/agent/vision_agent_prompts.py @@ -86,7 +86,7 @@ 
EXAMPLES_CODE2 = """ -USER: Can you create a function to count workers with helmets? +USER: Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes? OBSERVATION: [Artifacts loaded] @@ -101,16 +101,17 @@ Artifact workers.png loaded to /path/to/images/workers.png [End of artifacts] -AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you write code to count workers with helmets in this image?', media=['/paths/to/images/workers.png'])", "let_user_respond": false} +AGENT: {"thoughts": "I will use the generate_vision_code to count the workers with helmets in the image.", "response": "generate_vision_code(artifacts, 'code.py', 'Can you create a function to count workers with helmets, return the count and save a visaulization of the bounding boxes?', media=['/paths/to/images/workers.png'])", "let_user_respond": false} OBSERVATION: [Artifact code.py] -0|from vision_agent.tools import load_image, owl_v2, closest_box_distance -1|def count_workers_with_helmets(image_path: str): +0|from vision_agent.tools import load_image, owl_v2, closest_box_distance, overlay_bounding_boxes, save_image +1|def count_workers_with_helmets(image_path: str, output_path: str): 2| image = load_image(image_path) -3| workers = owl_v2("worker", image) -4| helmets = owl_v2("helmet", image) -5| count = 0 +3| detections = owl_v2("worker, helmet", image) +4| workers = [d for d in detections if d['label'] == 'worker'] +5| helmets = [d for d in detections if d['label'] == 'helmet'] +6| count = 0 6| for worker in workers: 7| person_box = worker['bbox'] 8| person_has_helmet = False @@ -120,14 +121,16 @@ 12| break 13| if person_has_helmet: 14| count += 1 +15| overlay_bounding_boxes(image, detections) +16| save_image(output_path, image) 15| return count [End of artifact] -AGENT: {"thoughts": "I have generated the code to count the 
workers with helmets in the image, I must now run the code to get the output.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png'))", "let_user_respond": false} +AGENT: {"thoughts": "I have generated the code to count the workers with helmets in the image, I must now run the code to get the output and write the visualization to the artifacts so the user can see it.", "response": "from code import count_workers_with_helmets\n print(count_workers_with_helmets('/path/to/images/workers.png', 'workers_viz.png'))\n write_media_artifact(artifacts, 'workers_viz.png')", "let_user_respond": false} OBSERVATION: ----- stdout ----- 2 -AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py", "let_user_respond": true} +AGENT: {"thoughts": "Two workers with helmets are detected, I will show this to the user and ask them if the result looks good.", "response": "I have written the code to count the workers wearing helmets in code.py and saved the visualization under 'workers_viz.png'.", "let_user_respond": true} """ From 1d8dd7863e53f5238ad2515a6befbf05b1945c7d Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 10:20:04 -0700 Subject: [PATCH 31/37] isort --- vision_agent/lmm/types.py | 1 + vision_agent/tools/__init__.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/vision_agent/lmm/types.py b/vision_agent/lmm/types.py index ba2b3189..b9c99fe2 100644 --- a/vision_agent/lmm/types.py +++ b/vision_agent/lmm/types.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import Dict, Sequence, Union + from vision_agent.utils.execute import Execution TextOrImage = Union[str, Sequence[Union[str, Path]]] diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index cbd92358..e82d7553 100644 --- 
a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -21,8 +21,8 @@ dpt_hybrid_midas, extract_frames, florence2_image_caption, - florence2_phrase_grounding, florence2_ocr, + florence2_phrase_grounding, florence2_roberta_vqa, florence2_sam2_image, florence2_sam2_video, From 7a510e3bc21cc9d9f74efc1405d1b3d47c3f7eac Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 13:18:26 -0700 Subject: [PATCH 32/37] fix typo --- vision_agent/utils/image_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/utils/image_utils.py b/vision_agent/utils/image_utils.py index 979f6c97..c1cc8eb6 100644 --- a/vision_agent/utils/image_utils.py +++ b/vision_agent/utils/image_utils.py @@ -100,7 +100,7 @@ def frames_to_bytes( """ with tempfile.NamedTemporaryFile(delete=True) as temp_file: clip = ImageSequenceClip(frames, fps=fps) - clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, code="libx264") + clip.write_videofile(temp_file.name + f".{file_ext}", fps=fps, codec="libx264") with open(temp_file.name + f".{file_ext}", "rb") as f: buffer_bytes = f.read() return buffer_bytes From ac9a5e09df599fb40249077fc8ad30f19c718415 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 14:59:15 -0700 Subject: [PATCH 33/37] added redisplay for nested notebook sessions --- vision_agent/tools/meta_tools.py | 34 ++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index 364bdf0e..b3c3ed8c 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -5,10 +5,13 @@ from pathlib import Path from typing import Any, Dict, List, Optional, Union +from IPython.display import display + import vision_agent as va from vision_agent.lmm.types import Message from vision_agent.tools.tool_utils import get_tool_documentation from vision_agent.tools.tools import TOOL_DESCRIPTIONS +from vision_agent.utils.execute 
import Execution, MimeType # These tools are adapted from SWE-Agent https://github.com/princeton-nlp/SWE-agent @@ -37,6 +40,35 @@ def filter_file(file_name: Union[str, Path]) -> bool: ) +def redisplay_results(execution: Execution) -> None: + """This function is used to add previous execution results to the current output. + This is handy if you are inside a notebook environment, call it notebook1, and you + have a nested notebook environment, call it notebook2, and you want the execution + results from notebook2 to be included in the execution results for notebook1. + """ + for result in execution.results: + if result.text is not None: + display({MimeType.TEXT_PLAIN: result.text}) + if result.html is not None: + display({MimeType.TEXT_HTML: result.html}) + if result.markdown is not None: + display({MimeType.TEXT_MARKDOWN: result.markdown}) + if result.svg is not None: + display({MimeType.IMAGE_SVG: result.svg}) + if result.png is not None: + display({MimeType.IMAGE_PNG: result.png}) + if result.jpeg is not None: + display({MimeType.IMAGE_JPEG: result.jpeg}) + if result.mp4 is not None: + display({MimeType.VIDEO_MP4_B64: result.mp4}) + if result.latex is not None: + display({MimeType.TEXT_LATEX: result.latex}) + if result.json is not None: + display({MimeType.APPLICATION_JSON: result.json}) + if result.extra is not None: + display(result.extra) + + class Artifacts: """Artifacts is a class that allows you to sync files between a local and remote environment. 
In our case, the remote environment could be where the VisionAgent is @@ -276,6 +308,7 @@ def detect_dogs(image_path: str): fixed_chat: List[Message] = [{"role": "user", "content": chat, "media": media}] response = agent.chat_with_workflow(fixed_chat, test_multi_plan=True) + redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code code_lines = code.splitlines(keepends=True) @@ -329,6 +362,7 @@ def detect_dogs(image_path: str): fixed_chat_history.append({"role": "user", "content": chat}) response = agent.chat_with_workflow(fixed_chat_history, test_multi_plan=False) + redisplay_results(response["test_result"]) code = response["code"] artifacts[name] = code code_lines = code.splitlines(keepends=True) From 32b1ce92d6083cebe706e0dc5e393b656b959f3f Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 19:13:08 -0700 Subject: [PATCH 34/37] return artifacts --- vision_agent/agent/vision_agent.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index df73cef4..5544b188 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -1,8 +1,9 @@ import copy import logging import os +import tempfile from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union, cast from vision_agent.agent import Agent from vision_agent.agent.agent_utils import extract_json @@ -135,7 +136,7 @@ def __init__( ( Path(local_artifacts_path) if local_artifacts_path is not None - else "artifacts.pkl" + else Path(tempfile.NamedTemporaryFile(delete=False).name) ), ) @@ -161,14 +162,14 @@ def __call__( input = [{"role": "user", "content": input}] if media is not None: input[0]["media"] = [media] - results = self.chat_with_code(input, artifacts) + results, _ = self.chat_with_code(input, artifacts) return results def chat_with_code( self, chat: 
List[Message], artifacts: Optional[Artifacts] = None, - ) -> List[Message]: + ) -> Tuple[List[Message], Artifacts]: """Chat with VisionAgent, it will use code to execute actions to accomplish its tasks. @@ -187,6 +188,7 @@ def chat_with_code( raise ValueError("chat cannot be empty") if not artifacts: + # this is setting remote artifacts path artifacts = Artifacts(WORKSPACE / "artifacts.pkl") with CodeInterpreterFactory.new_instance( @@ -265,9 +267,8 @@ def chat_with_code( if self.verbosity >= 1: _LOGGER.info(obs) - int_chat.append( - {"role": "observation", "content": obs, "execution": result} - ) + # don't add execution results to internal chat + int_chat.append({"role": "observation", "content": obs}) orig_chat.append( {"role": "observation", "content": obs, "execution": result} ) @@ -281,7 +282,7 @@ def chat_with_code( ) artifacts.load(self.local_artifacts_path) artifacts.save() - return orig_chat + return orig_chat, artifacts def log_progress(self, data: Dict[str, Any]) -> None: pass From 33cf8e71af32c59058fc15a8ac1901fa3fab0b69 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 19:18:53 -0700 Subject: [PATCH 35/37] add trace for last edited artifact --- vision_agent/tools/meta_tools.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index b3c3ed8c..833ad542 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -194,6 +194,8 @@ def create_code_artifact(artifacts: Artifacts, name: str) -> str: artifacts[name] = "" return_str = f"[Artifact {name} created]" print(return_str) + + display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) return return_str @@ -270,6 +272,7 @@ def edit_code_artifact( artifacts[name] = "".join(edited_lines) + display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) return open_code_artifact(artifacts, name, cur_line) @@ -313,6 +316,8 @@ def detect_dogs(image_path: str): artifacts[name] = code 
code_lines = code.splitlines(keepends=True) total_lines = len(code_lines) + + display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) return view_lines(code_lines, 0, total_lines, name, total_lines) @@ -367,6 +372,8 @@ def detect_dogs(image_path: str): artifacts[name] = code code_lines = code.splitlines(keepends=True) total_lines = len(code_lines) + + display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) return view_lines(code_lines, 0, total_lines, name, total_lines) From 40c1cbdce4feec26de15a35c15b588ca3dc10539 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 20:19:22 -0700 Subject: [PATCH 36/37] handle artifact return --- examples/chat/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/chat/app.py b/examples/chat/app.py index 68dede8d..9291f65a 100644 --- a/examples/chat/app.py +++ b/examples/chat/app.py @@ -53,7 +53,7 @@ def update_messages(messages, lock): if Path("artifacts.pkl").exists(): artifacts.load("artifacts.pkl") - new_chat = agent.chat_with_code(messages, artifacts=artifacts) + new_chat, _ = agent.chat_with_code(messages, artifacts=artifacts) with lock: for new_message in new_chat: if new_message not in messages: From 58a1be455a579e0dbd6248762849aceffc854a08 Mon Sep 17 00:00:00 2001 From: Dillon Laird Date: Thu, 29 Aug 2024 20:19:44 -0700 Subject: [PATCH 37/37] only add text to obs, no trace --- vision_agent/agent/vision_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 5544b188..2bb04343 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -263,7 +263,7 @@ def chat_with_code( result = run_code_action( code_action, code_interpreter, str(remote_artifacts_path) ) - obs = result.text() + obs = str(result.logs) if self.verbosity >= 1: _LOGGER.info(obs)