From c884ddc197e0c580615b59f57edf4d38c8c12233 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 14:30:37 +0800 Subject: [PATCH 01/12] do not upload to code_interpreter --- vision_agent/agent/vision_agent_coder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index b10988c6..771855a5 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -718,7 +718,6 @@ def chat_with_workflow( for chat_i in chat: if "media" in chat_i: for media in chat_i["media"]: - media = code_interpreter.upload_file(media) chat_i["content"] += f" Media name {media}" # type: ignore media_list.append(media) @@ -754,7 +753,9 @@ def chat_with_workflow( plans = write_plans( int_chat, T.get_tool_descriptions_by_names( - customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore + customized_tool_names, + T.FUNCTION_TOOLS, + T.UTIL_TOOLS, # type: ignore ), format_memory(working_memory), self.planner, From 126d3ab0d65bdcac5f945c8d6d7a73e712164f71 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 14:32:08 +0800 Subject: [PATCH 02/12] encode_media support url and mp4 --- vision_agent/lmm/lmm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index e78a0593..27feff83 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -30,6 +30,10 @@ def encode_image_bytes(image: bytes) -> str: def encode_media(media: Union[str, Path]) -> str: + if type(media) is str and media.startswith(("http", "https")): + if media.endswith(".mp4"): + return media[:-4] + ".png" + return media extension = "png" extension = Path(media).suffix if extension.lower() not in { @@ -138,7 +142,9 @@ def chat( { "type": "image_url", "image_url": { - "url": f"data:image/png;base64,{encoded_media}", + "url": encoded_media + if encoded_media.startswith(("http", "https")) 
+ else f"data:image/png;base64,{encoded_media}", "detail": "low", }, }, From 42d99de2f322b845d70a941d171e161801b75324 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 14:32:21 +0800 Subject: [PATCH 03/12] load_image --- vision_agent/tools/tools.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 594fcf6d..7e3f7338 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1,3 +1,4 @@ +import os import io import json import logging @@ -14,6 +15,7 @@ from PIL import Image, ImageDraw, ImageFont from pillow_heif import register_heif_opener # type: ignore from pytube import YouTube # type: ignore +import urllib.request from vision_agent.clients.landing_public_api import LandingPublicAPI from vision_agent.tools.tool_utils import ( @@ -1250,10 +1252,10 @@ def default(self, obj: Any): # type: ignore def load_image(image_path: str) -> np.ndarray: - """'load_image' is a utility function that loads an image from the given file path string. + """'load_image' is a utility function that loads an image from the given file path string or an URL. Parameters: - image_path (str): The path to the image. + image_path (str): The path or URL to the image. Returns: np.ndarray: The image as a NumPy array. 
@@ -1265,6 +1267,15 @@ def load_image(image_path: str) -> np.ndarray: # NOTE: sometimes the generated code pass in a NumPy array if isinstance(image_path, np.ndarray): return image_path + if image_path.startswith(("http", "https")): + _, image_suffix = os.path.splitext(image_path) + with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file: + # Download the image and save it to the temporary file + with urllib.request.urlopen(image_path) as response: + tmp_file.write(response.read()) + _LOGGER.info(f"{image_path} saved to {tmp_file.name}") + print(f"{image_path} saved to {tmp_file.name}") + image_path = tmp_file.name image = Image.open(image_path).convert("RGB") return np.array(image) From 095943c5f9bb119ac84e1c19d999531d1dba2a49 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 14:46:19 +0800 Subject: [PATCH 04/12] remove print --- vision_agent/tools/tools.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 7e3f7338..07f6a1aa 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1273,8 +1273,6 @@ def load_image(image_path: str) -> np.ndarray: # Download the image and save it to the temporary file with urllib.request.urlopen(image_path) as response: tmp_file.write(response.read()) - _LOGGER.info(f"{image_path} saved to {tmp_file.name}") - print(f"{image_path} saved to {tmp_file.name}") image_path = tmp_file.name image = Image.open(image_path).convert("RGB") return np.array(image) From b0a464bca48d04ec579aa872d4222ade2bf367c4 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 14:50:11 +0800 Subject: [PATCH 05/12] add comment for video associated png --- vision_agent/lmm/lmm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 27feff83..156f79ca 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -31,7 +31,9 @@ def 
encode_image_bytes(image: bytes) -> str: def encode_media(media: Union[str, Path]) -> str: if type(media) is str and media.startswith(("http", "https")): - if media.endswith(".mp4"): + # for mp4 video url, we assume there is a same url but ends with png + # vision-agent-ui will upload this png when uploading the video + if media.endswith((".mp4", "mov")): return media[:-4] + ".png" return media extension = "png" @@ -396,7 +398,6 @@ def chat( tmp_kwargs = self.kwargs | kwargs data.update(tmp_kwargs) if "stream" in tmp_kwargs and tmp_kwargs["stream"]: - json_data = json.dumps(data) def f() -> Iterator[Optional[str]]: @@ -430,7 +431,6 @@ def generate( media: Optional[List[Union[str, Path]]] = None, **kwargs: Any, ) -> Union[str, Iterator[Optional[str]]]: - url = f"{self.url}/generate" data: Dict[str, Any] = { "model": self.model_name, @@ -445,7 +445,6 @@ def generate( tmp_kwargs = self.kwargs | kwargs data.update(tmp_kwargs) if "stream" in tmp_kwargs and tmp_kwargs["stream"]: - json_data = json.dumps(data) def f() -> Iterator[Optional[str]]: From 7f36f6a10b5e163c354c5dbd4ffcb01cb02c1e4f Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 14:50:53 +0800 Subject: [PATCH 06/12] minor revert --- vision_agent/agent/vision_agent_coder.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 771855a5..6e52e45c 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -753,9 +753,7 @@ def chat_with_workflow( plans = write_plans( int_chat, T.get_tool_descriptions_by_names( - customized_tool_names, - T.FUNCTION_TOOLS, - T.UTIL_TOOLS, # type: ignore + customized_tool_names, T.FUNCTION_TOOLS, T.UTIL_TOOLS # type: ignore ), format_memory(working_memory), self.planner, From 635c51ab21da3345ba06e0fcfe8a6f34c1011d97 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 15:51:35 +0800 Subject: [PATCH 07/12] lint --- 
vision_agent/lmm/lmm.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 156f79ca..1881c0bd 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -144,9 +144,11 @@ def chat( { "type": "image_url", "image_url": { - "url": encoded_media - if encoded_media.startswith(("http", "https")) - else f"data:image/png;base64,{encoded_media}", + "url": ( + encoded_media + if encoded_media.startswith(("http", "https")) + else f"data:image/png;base64,{encoded_media}" + ), "detail": "low", }, }, From bf8de9c978de977095d80ac28873bfcf6099602f Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 16:12:53 +0800 Subject: [PATCH 08/12] backwards --- vision_agent/agent/vision_agent_coder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 6e52e45c..88a0a966 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -718,6 +718,7 @@ def chat_with_workflow( for chat_i in chat: if "media" in chat_i: for media in chat_i["media"]: + media = media if type(media) is str and media.startswith(("http", "https")) else code_interpreter.upload_file(media) chat_i["content"] += f" Media name {media}" # type: ignore media_list.append(media) From 19d31c9bc59ee9edfd59bcc6c3fb81a9124bf4a2 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 16:15:45 +0800 Subject: [PATCH 09/12] lint --- vision_agent/agent/vision_agent_coder.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vision_agent/agent/vision_agent_coder.py b/vision_agent/agent/vision_agent_coder.py index 88a0a966..ac86abfc 100644 --- a/vision_agent/agent/vision_agent_coder.py +++ b/vision_agent/agent/vision_agent_coder.py @@ -718,7 +718,12 @@ def chat_with_workflow( for chat_i in chat: if "media" in chat_i: for media in chat_i["media"]: - media = media if type(media) is str and 
media.startswith(("http", "https")) else code_interpreter.upload_file(media) + media = ( + media + if type(media) is str + and media.startswith(("http", "https")) + else code_interpreter.upload_file(media) + ) chat_i["content"] += f" Media name {media}" # type: ignore media_list.append(media) From 2a2ef320b632b6d52bf668d41edd703fdb57ba41 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 18:15:53 +0800 Subject: [PATCH 10/12] also save video --- vision_agent/tools/tools.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 07f6a1aa..403d2271 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -1222,6 +1222,13 @@ def extract_frames( video_file_path = video.download(output_path=temp_dir) return extract_frames_from_video(video_file_path, fps) + elif str(video_uri).startswith(("http", "https")): + _, image_suffix = os.path.splitext(video_uri) + with tempfile.NamedTemporaryFile(delete=False, suffix=image_suffix) as tmp_file: + # Download the video and save it to the temporary file + with urllib.request.urlopen(str(video_uri)) as response: + tmp_file.write(response.read()) + return extract_frames_from_video(tmp_file.name, fps) return extract_frames_from_video(str(video_uri), fps) From a408f5c1a342ea99f17e7b0dbc2be50fae7e0587 Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 19:05:58 +0800 Subject: [PATCH 11/12] more strict check for vision-agent-ui --- vision_agent/lmm/lmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index 1881c0bd..b9f7ec54 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -33,7 +33,7 @@ def encode_media(media: Union[str, Path]) -> str: if type(media) is str and media.startswith(("http", "https")): # for mp4 video url, we assume there is a same url but ends with png # vision-agent-ui will upload this png when uploading the video - if 
media.endswith((".mp4", "mov")): + if media.endswith((".mp4", "mov")) and media.find('vision-agent-dev.s3') != -1: return media[:-4] + ".png" return media extension = "png" From 51664f8bbec5b64d9273ae54ebdf58b10f744efe Mon Sep 17 00:00:00 2001 From: Zhichao Date: Wed, 28 Aug 2024 20:40:52 +0800 Subject: [PATCH 12/12] fix lint --- vision_agent/lmm/lmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vision_agent/lmm/lmm.py b/vision_agent/lmm/lmm.py index b9f7ec54..15df5ac9 100644 --- a/vision_agent/lmm/lmm.py +++ b/vision_agent/lmm/lmm.py @@ -33,7 +33,7 @@ def encode_media(media: Union[str, Path]) -> str: if type(media) is str and media.startswith(("http", "https")): # for mp4 video url, we assume there is a same url but ends with png # vision-agent-ui will upload this png when uploading the video - if media.endswith((".mp4", "mov")) and media.find('vision-agent-dev.s3') != -1: + if media.endswith((".mp4", "mov")) and media.find("vision-agent-dev.s3") != -1: return media[:-4] + ".png" return media extension = "png"