From 63e8630d0f2a8565010b1180e0556f6c14ae7ead Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Tue, 26 Mar 2024 11:48:47 -0700 Subject: [PATCH] Update ExtractFrames tool --- poetry.lock | 4 ++-- vision_agent/tools/tools.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index d7c66b5d..d6ba3c83 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1209,9 +1209,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -1275,8 +1275,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""}, {version = ">=1.23.2,<2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 42b3a810..b6578502 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -459,7 +459,7 @@ def __call__(self, input: List[int]) -> float: class ExtractFrames(Tool): name = "extract_frames_" - description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video." + description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame." usage = { "required_parameters": [{"name": "video_uri", "type": "str"}], "examples": [ @@ -474,14 +474,23 @@ class ExtractFrames(Tool): ], } - def __call__(self, video_uri: str) -> list[tuple[np.ndarray, float]]: + def __call__(self, video_uri: str) -> list[tuple[str, float]]: try: from vision_agent.tools.video import extract_frames_from_video except Exception as e: raise ImportError( "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead." ) from e - return extract_frames_from_video(video_uri) + frames = extract_frames_from_video(video_uri) + result = [] + _LOGGER.info( + f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks." + ) + for frame, ts in frames: + with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: + Image.fromarray(frame).save(tmp) + result.append((tmp.name, ts)) + return result TOOLS = {