From 9a773d285bb843d6ae3007cd87521dce7317ae9e Mon Sep 17 00:00:00 2001
From: Zhichao <yzld2002@gmail.com>
Date: Thu, 12 Sep 2024 14:56:34 +0800
Subject: [PATCH 1/4] feat: add OPENAI_API_KEY to e2b env (#235)

* add OPENAI_API_KEY to e2b env

* add anthropic key

* fix type
---
 vision_agent/utils/execute.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index c2e0e652..65e9fea9 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -691,8 +691,9 @@ def new_instance(
         if not code_sandbox_runtime:
             code_sandbox_runtime = os.getenv("CODE_SANDBOX_RUNTIME", "local")
         if code_sandbox_runtime == "e2b":
+            envs = _get_e2b_env()
             instance: CodeInterpreter = E2BCodeInterpreter(
-                timeout=_SESSION_TIMEOUT, remote_path=remote_path
+                timeout=_SESSION_TIMEOUT, remote_path=remote_path, envs=envs
             )
         elif code_sandbox_runtime == "local":
             instance = LocalCodeInterpreter(
@@ -705,6 +706,20 @@ def new_instance(
         return instance
 
 
+def _get_e2b_env() -> Union[Dict[str, str], None]:
+    """Collect the LLM API keys to forward to the E2B sandbox.
+
+    Returns:
+        A dict holding whichever of OPENAI_API_KEY and ANTHROPIC_API_KEY are
+        set to a non-empty value in this process, or None if neither is.
+    """
+    names = ("OPENAI_API_KEY", "ANTHROPIC_API_KEY")
+    # Unset and empty variables are both skipped, so only usable keys remain.
+    candidates = ((name, os.getenv(name, "")) for name in names)
+    forwarded = {name: value for name, value in candidates if value}
+    return forwarded or None
+
+
 def _parse_local_code_interpreter_outputs(outputs: List[Dict[str, Any]]) -> Execution:
     """Parse notebook cell outputs to Execution object. Output types:
     https://nbformat.readthedocs.io/en/latest/format_description.html#code-cell-outputs

From b66a8c90cd4c8153d7c81403d52eb6ec027ac5f5 Mon Sep 17 00:00:00 2001
From: GitHub Actions Bot <yazhou.cao@landing.ai>
Date: Thu, 12 Sep 2024 06:59:20 +0000
Subject: [PATCH 2/4] [skip ci] chore(release): vision-agent 0.2.134

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ffcb17bd..1131268b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.133"
+version = "0.2.134"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"

From 91cce76734085d416009dc20d251ed9d69fb933c Mon Sep 17 00:00:00 2001
From: wuyiqunLu <132986242+wuyiqunLu@users.noreply.github.com>
Date: Sat, 14 Sep 2024 13:48:59 +0800
Subject: [PATCH 3/4] feat: support artifact name display (#236)

* feat: support artifact name display

* fix lint
---
 vision_agent/agent/vision_agent.py | 18 +++++++++++++---
 vision_agent/tools/meta_tools.py   | 33 +++++++++++++++++-------------
 vision_agent/utils/execute.py      |  5 +++++
 3 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/vision_agent/agent/vision_agent.py b/vision_agent/agent/vision_agent.py
index 736c9754..40195246 100644
--- a/vision_agent/agent/vision_agent.py
+++ b/vision_agent/agent/vision_agent.py
@@ -15,6 +15,7 @@
 from vision_agent.lmm import LMM, Message, OpenAILMM
 from vision_agent.tools import META_TOOL_DOCSTRING, save_image, load_image
 from vision_agent.tools.meta_tools import Artifacts, use_extra_vision_agent_args
+from vision_agent.tools.tools import extract_frames, save_video
 from vision_agent.utils import CodeInterpreterFactory
 from vision_agent.utils.execute import CodeInterpreter, Execution
 
@@ -224,9 +225,20 @@ def chat_with_code(
                     for media in chat_i["media"]:
                         if type(media) is str and media.startswith(("http", "https")):
                             # TODO: Ideally we should not call VA.tools here, we should come to revisit how to better support remote image later
-                            file_path = Path(media).name
-                            ndarray = load_image(media)
-                            save_image(ndarray, file_path)
+                            file_path = str(
+                                Path(self.local_artifacts_path).parent
+                                / Path(media).name
+                            )
+                            if file_path.lower().endswith(
+                                ".mp4"
+                            ) or file_path.lower().endswith(".mov"):
+                                video_frames = extract_frames(media)
+                                save_video(
+                                    [frame for frame, _ in video_frames], file_path
+                                )
+                            else:
+                                ndarray = load_image(media)
+                                save_image(ndarray, file_path)
                             media = file_path
                         else:
                             media = cast(str, media)
diff --git a/vision_agent/tools/meta_tools.py b/vision_agent/tools/meta_tools.py
index ccf98287..477cbd30 100644
--- a/vision_agent/tools/meta_tools.py
+++ b/vision_agent/tools/meta_tools.py
@@ -53,25 +53,27 @@ def redisplay_results(execution: Execution) -> None:
     """
     for result in execution.results:
         if result.text is not None:
-            display({MimeType.TEXT_PLAIN: result.text})
+            display({MimeType.TEXT_PLAIN: result.text}, raw=True)
         if result.html is not None:
-            display({MimeType.TEXT_HTML: result.html})
+            display({MimeType.TEXT_HTML: result.html}, raw=True)
         if result.markdown is not None:
-            display({MimeType.TEXT_MARKDOWN: result.markdown})
+            display({MimeType.TEXT_MARKDOWN: result.markdown}, raw=True)
         if result.svg is not None:
-            display({MimeType.IMAGE_SVG: result.svg})
+            display({MimeType.IMAGE_SVG: result.svg}, raw=True)
         if result.png is not None:
-            display({MimeType.IMAGE_PNG: result.png})
+            display({MimeType.IMAGE_PNG: result.png}, raw=True)
         if result.jpeg is not None:
-            display({MimeType.IMAGE_JPEG: result.jpeg})
+            display({MimeType.IMAGE_JPEG: result.jpeg}, raw=True)
         if result.mp4 is not None:
-            display({MimeType.VIDEO_MP4_B64: result.mp4})
+            display({MimeType.VIDEO_MP4_B64: result.mp4}, raw=True)
         if result.latex is not None:
-            display({MimeType.TEXT_LATEX: result.latex})
+            display({MimeType.TEXT_LATEX: result.latex}, raw=True)
         if result.json is not None:
-            display({MimeType.APPLICATION_JSON: result.json})
+            display({MimeType.APPLICATION_JSON: result.json}, raw=True)
+        if result.artifact_name is not None:
+            display({MimeType.TEXT_ARTIFACT_NAME: result.artifact_name}, raw=True)
         if result.extra is not None:
-            display(result.extra)
+            display(result.extra, raw=True)
 
 
 class Artifacts:
@@ -208,7 +210,7 @@ def create_code_artifact(artifacts: Artifacts, name: str) -> str:
         return_str = f"[Artifact {name} created]"
     print(return_str)
 
-    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
+    display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
     return return_str
 
 
@@ -292,7 +294,7 @@ def edit_code_artifact(
 
     artifacts[name] = "".join(edited_lines)
 
-    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
+    display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
     return open_code_artifact(artifacts, name, cur_line)
 
 
@@ -348,7 +350,7 @@ def detect_dogs(image_path: str):
     code_lines = code.splitlines(keepends=True)
     total_lines = len(code_lines)
 
-    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
+    display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 
@@ -413,7 +415,7 @@ def detect_dogs(image_path: str):
     code_lines = code.splitlines(keepends=True)
     total_lines = len(code_lines)
 
-    display({MimeType.APPLICATION_JSON: {"last_artifact": name}})
+    display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
     return view_lines(code_lines, 0, total_lines, name, total_lines)
 
 
@@ -427,6 +429,7 @@ def write_media_artifact(artifacts: Artifacts, local_path: str) -> str:
     with open(local_path, "rb") as f:
         media = f.read()
     artifacts[Path(local_path).name] = media
+    display({MimeType.TEXT_ARTIFACT_NAME: Path(local_path).name}, raw=True)
     return f"[Media {Path(local_path).name} saved]"
 
 
@@ -592,6 +595,8 @@ def replacer(match: re.Match) -> str:
 
     diff = get_diff_with_prompts(name, code, new_code)
     print(diff)
+
+    display({MimeType.TEXT_ARTIFACT_NAME: name}, raw=True)
     return diff
 
 
diff --git a/vision_agent/utils/execute.py b/vision_agent/utils/execute.py
index 65e9fea9..cc01a89d 100644
--- a/vision_agent/utils/execute.py
+++ b/vision_agent/utils/execute.py
@@ -56,6 +56,7 @@ class MimeType(str, Enum):
     TEXT_LATEX = "text/latex"
     APPLICATION_JSON = "application/json"
     APPLICATION_JAVASCRIPT = "application/javascript"
+    TEXT_ARTIFACT_NAME = "text/artifact/name"
 
 
 class FileSerializer:
@@ -103,6 +104,7 @@ class Result:
     latex: Optional[str] = None
     json: Optional[Dict[str, Any]] = None
     javascript: Optional[str] = None
+    artifact_name: Optional[str] = None
     extra: Optional[Dict[str, Any]] = None
     "Extra data that can be included. Not part of the standard types."
 
@@ -127,6 +129,7 @@ def __init__(self, is_main_result: bool, data: Dict[str, Any]):
         self.latex = data.pop(MimeType.TEXT_LATEX, None)
         self.json = data.pop(MimeType.APPLICATION_JSON, None)
         self.javascript = data.pop(MimeType.APPLICATION_JAVASCRIPT, None)
+        self.artifact_name = data.pop(MimeType.TEXT_ARTIFACT_NAME, None)
         self.extra = data
         # Only keeping the PNG representation if both PNG and JPEG are present
         if self.png and self.jpeg:
@@ -204,6 +207,8 @@ def formats(self) -> Iterable[str]:
             formats.append("javascript")
         if self.mp4:
             formats.append("mp4")
+        if self.artifact_name:
+            formats.append("artifact_name")
         if self.extra:
             formats.extend(iter(self.extra))
         return formats

From a56927db9a079ddf7572b4783079c52b742dc4de Mon Sep 17 00:00:00 2001
From: GitHub Actions Bot <yazhou.cao@landing.ai>
Date: Sat, 14 Sep 2024 05:52:10 +0000
Subject: [PATCH 4/4] [skip ci] chore(release): vision-agent 0.2.135

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 1131268b..7499eb7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "vision-agent"
-version = "0.2.134"
+version = "0.2.135"
 description = "Toolset for Vision Agent"
 authors = ["Landing AI <dev@landing.ai>"]
 readme = "README.md"