Add frame extraction tool for video processing

landing-ai · Mar 25, 2024 · 31791be · 31791be
1 parent 7fe61ab
commit 31791be
Show file tree

Hide file tree

Showing 6 changed files with 353 additions and 1 deletion.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,10 @@ sentence-transformers = "2.*"
 openai = "1.*"
 typing_extensions = "4.*"
 
+# Dependencies for video handling, declared as a separate Poetry dependency group.
+# NOTE(review): a Poetry group is not a pip extra; the ExtractFrames error message
+# tells users to run 'pip install vision-agent[video]' -- confirm a matching
+# `video` extra is declared, otherwise pip will not install these packages.
+[tool.poetry.group.video.dependencies]
+moviepy = "1.*"
+opencv-python-headless = "4.*"
+
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
 pytest = "7.*"
@@ -84,4 +88,5 @@ module = [
  "faiss.*",
  "openai.*",
  "sentence_transformers.*",
+ "moviepy.*",
 ]
diff --git a/tests/data/video/test.mp4 b/tests/data/video/test.mp4
diff --git a/tests/tools/test_video.py b/tests/tools/test_video.py
@@ -0,0 +1,7 @@
+from vision_agent.tools.video import extract_frames_from_video
+
+
+def test_extract_frames_from_video():
+ video_path = "tests/data/video/test.mp4"
+ res = extract_frames_from_video(video_path)
+ assert len(res) == 1
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
@@ -457,6 +457,33 @@ def __call__(self, input: List[int]) -> float:
  return round(input[0] / input[1], 2)
 
 
+class ExtractFrames(Tool):
+ name = "extract_frames_"
+ description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video."
+ usage = {
+ "required_parameters": [{"name": "video_uri", "type": "str"}],
+ "examples": [
+ {
+ "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
+ "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
+ },
+ {
+ "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
+ "parameters": {"video_uri": "tests/data/test.mp4"},
+ },
+ ],
+ }
+
+ def __call__(self, video_uri: str) -> list[tuple[np.ndarray, float]]:
+ try:
+ from vision_agent.tools.video import extract_frames_from_video
+ except Exception as e:
+ raise ImportError(
+ "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead."
+ ) from e
+ return extract_frames_from_video(video_uri)
+
+
 TOOLS = {
  i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
  for i, c in enumerate(
@@ -472,6 +499,7 @@ def __call__(self, input: List[int]) -> float:
  Subtract,
  Multiply,
  Divide,
+ ExtractFrames,
  ]
  )
  if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))