From 9a773d285bb843d6ae3007cd87521dce7317ae9e Mon Sep 17 00:00:00 2001 From: Zhichao Date: Thu, 12 Sep 2024 14:56:34 +0800 Subject: [PATCH 1/4] feat: add OPENAI_API_KEY to e2b env (#235) * add OPENAI_API_KEY to e2b env * add anthropic key * fix type --- vision_agent/utils/execute.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index c2e0e652..65e9fea9 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -691,8 +691,9 @@ def new_instance( if not code_sandbox_runtime: code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local") if code_sandbox_runtime == "e2b": + envs = _get_e2b_env() instance: CodeInterpreter = E2BCodeInterpreter( - timeout=_SESSION_TIMEOUT, remote_path=remote_path + timeout=_SESSION_TIMEOUT, remote_path=remote_path, envs=envs ) elif code_sandbox_runtime == "local": instance = LocalCodeInterpreter( @@ -705,6 +706,20 @@ def new_instance( return instance +def _get_e2b_env() -> Union[Dict[str, str], None]: + openai_api_key = os.getenv("OPENAI_API_KEY", "") + anthropic_api_key = os.getenv("ANTHROPIC_API_KEY", "") + if openai_api_key or anthropic_api_key: + envs = {} + if openai_api_key: + envs["OPENAI_API_KEY"] = openai_api_key + if anthropic_api_key: + envs["ANTHROPIC_API_KEY"] = anthropic_api_key + else: + envs = None + return envs + + def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution: """Parse notebook cell outputs to Execution object. Output types: https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs From b66a8c90cd4c8153d7c81403d52eb6ec027ac5f5 Mon Sep 17 00:00:00 2001 From: GitHub Actions Bot Date: Thu, 12 Sep 2024 06:59:20 +0000 Subject: [PATCH 2/4] [skip ci] chore(release): vision-agent 0.2.134 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ffcb17bd..1131268b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "vision-agent" -version = "0.2.133" +version = "0.2.134" description = "Toolset for Vision Agent" authors = ["Landing AI "] readme = "README.md" From 91cce76734085d416009dc20d251ed9d69fb933c Mon Sep 17 00:00:00 2001 From: wuyiqunLu <132986242+wuyiqunLu@users.noreply.github.com> Date: Sat, 14 Sep 2024 13:48:59 +0800 Subject: [PATCH 3/4] feat: support artifact name display (#236) * feat: support artifact name display * fix lint --- vision_agent/agent/vision_agent.py | 18 +++++++++++++--- vision_agent/tools/meta_tools.py | 33 +++++++++++++++++------------- vision_agent/utils/execute.py | 5 +++++ 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py index 736c9754..40195246 100644 --- a/vision_agent/agent/vision_agent.py +++ b/vision_agent/agent/vision_agent.py @@ -15,6 +15,7 @@ from vision_agent.lmm import LMM, Message, OpenAILMM from vision_agent.tools import META_TOOL_DOCSTRING, save_image, load_image from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args +from vision_agent.tools.tools import extract_frames, save_video from vision_agent.utils import CodeInterpreterFactory from vision_agent.utils.execute import CodeInterpreter, Execution @@ -224,9 +225,20 @@ def chat_with_code( for media in chat_i["media"]: if type(media) is str and media.startswith(("http", "https")): # TODO: Ideally we should not call VA.tools here, we should come to revisit how to better support remote image later - file_path = Path(media).name - ndarray = load_image(media) - save_image(ndarray, file_path) + file_path = str( + Path(self.local_artifacts_path).parent + / Path(media).name + ) + if file_path.lower().endswith( + ".mp4" + ) or file_path.lower().endswith(".mov"): + video_frames = extract_frames(media) + save_video( + [frame for frame, _ in video_frames], file_path + ) + else: + ndarray = load_image(media) + save_image(ndarray, file_path) media = file_path else: media = cast(str, media) diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py index ccf98287..477cbd30 100644 --- a/vision_agent/tools/meta_tools.py +++ b/vision_agent/tools/meta_tools.py @@ -53,25 +53,27 @@ def redisplay_results(execution: Execution) -> None: """ for result in execution.results: if result.text is not None: - display({MimeType.TEXT_PLAIN: result.text}) + display({MimeType.TEXT_PLAIN: result.text}, raw=True) if result.html is not None: - display({MimeType.TEXT_HTML: result.html}) + display({MimeType.TEXT_HTML: result.html}, raw=True) if result.markdown is not None: - display({MimeType.TEXT_MARKDOWN: result.markdown}) + display({MimeType.TEXT_MARKDOWN: result.markdown}, raw=True) if result.svg is not None: - display({MimeType.IMAGE_SVG: result.svg}) + display({MimeType.IMAGE_SVG: result.svg}, raw=True) if result.png is not None: - display({MimeType.IMAGE_PNG: result.png}) + display({MimeType.IMAGE_PNG: result.png}, raw=True) if result.jpeg is not None: - display({MimeType.IMAGE_JPEG: result.jpeg}) + display({MimeType.IMAGE_JPEG: result.jpeg}, raw=True) if result.mp4 is not None: - display({MimeType.VIDEO_MP4_B64: result.mp4}) + display({MimeType.VIDEO_MP4_B64: result.mp4}, raw=True) if result.latex is not None: - display({MimeType.TEXT_LATEX: result.latex}) + display({MimeType.TEXT_LATEX: result.latex}, raw=True) if result.json is not None: - display({MimeType.APPLICATION_JSON: result.json}) + display({MimeType.APPLICATION_JSON: result.json}, raw=True) + if result.artifact_name is not None: + display({MimeType.TEXT_ARTIFACT_NAME: result.artifact_name}, raw=True) if result.extra is not None: - display(result.extra) + display(result.extra, raw=True) class Artifacts: @@ -208,7 +210,7 @@ def create_code_artifact(artifacts: Artifacts, name: str) -> str: return_str = f"[Artifact {name} created]" print(return_str) - display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) + display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True) return return_str @@ -292,7 +294,7 @@ def edit_code_artifact( artifacts[name] = "".join(edited_lines) - display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) + display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True) return open_code_artifact(artifacts, name, cur_line) @@ -348,7 +350,7 @@ def detect_dogs(image_path: str): code_lines = code.splitlines(keepends=True) total_lines = len(code_lines) - display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) + display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True) return view_lines(code_lines, 0, total_lines, name, total_lines) @@ -413,7 +415,7 @@ def detect_dogs(image_path: str): code_lines = code.splitlines(keepends=True) total_lines = len(code_lines) - display({MimeType.APPLICATION_JSON: {"last_artifact": name}}) + display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True) return view_lines(code_lines, 0, total_lines, name, total_lines) @@ -427,6 +429,7 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str: with open(local_path, "rb") as f: media = f.read() artifacts[Path(local_path).name] = media + display({MimeType.TEXT_ARTIFACT_NAME: Path(local_path).name}, raw=True) return f"[Media {Path(local_path).name} saved]" @@ -592,6 +595,8 @@ def replacer(match: re.Match) -> str: diff = get_diff_with_prompts(name, code, new_code) print(diff) + + display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True) return diff diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py index 65e9fea9..cc01a89d 100644 --- a/vision_agent/utils/execute.py +++ b/vision_agent/utils/execute.py @@ -56,6 +56,7 @@ class MimeType(str, Enum): TEXT_LATEX = "text/latex" APPLICATION_JSON = "application/json" APPLICATION_JAVASCRIPT = "application/javascript" + TEXT_ARTIFACT_NAME = "text/artifact/name" class FileSerializer: @@ -103,6 +104,7 @@ class Result: latex: Optional[str] = None json: Optional[Dict[str, Any]] = None javascript: Optional[str] = None + artifact_name: Optional[str] = None extra: Optional[Dict[str, Any]] = None "Extra data that can be included. Not part of the standard types." @@ -127,6 +129,7 @@ def __init__(self, is_main_result: bool, data: Dict[str, Any]): self.latex = data.pop(MimeType.TEXT_LATEX, None) self.json = data.pop(MimeType.APPLICATION_JSON, None) self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None) + self.artifact_name = data.pop(MimeType.TEXT_ARTIFACT_NAME, None) self.extra = data # Only keeping the PNG representation if both PNG and JPEG are present if self.png and self.jpeg: @@ -204,6 +207,8 @@ def formats(self) -> Iterable[str]: formats.append("javascript") if self.mp4: formats.append("mp4") + if self.artifact_name: + formats.append("artifact_name") if self.extra: formats.extend(iter(self.extra)) return formats From a56927db9a079ddf7572b4783079c52b742dc4de Mon Sep 17 00:00:00 2001 From: GitHub Actions Bot Date: Sat, 14 Sep 2024 05:52:10 +0000 Subject: [PATCH 4/4] [skip ci] chore(release): vision-agent 0.2.135 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1131268b..7499eb7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "vision-agent" -version = "0.2.134" +version = "0.2.135" description = "Toolset for Vision Agent" authors = ["Landing AI "] readme = "README.md"