diff --git a/poetry.lock b/poetry.lock index 3f479663..d7c66b5d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -257,6 +257,17 @@ files = [ [package.extras] dev = ["black", "flake8", "flake8-pyi", "matplotlib", "mypy (==0.770)", "numpy", "pandas", "pytest"] +[[package]] +name = "decorator" +version = "4.4.2" +description = "Decorators for Humans" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*" +files = [ + {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"}, + {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"}, +] + [[package]] name = "distro" version = "1.9.0" @@ -518,6 +529,56 @@ files = [ {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, ] +[[package]] +name = "imageio" +version = "2.34.0" +description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "imageio-2.34.0-py3-none-any.whl", hash = "sha256:08082bf47ccb54843d9c73fe9fc8f3a88c72452ab676b58aca74f36167e8ccba"}, + {file = "imageio-2.34.0.tar.gz", hash = "sha256:ae9732e10acf807a22c389aef193f42215718e16bd06eed0c5bb57e1034a4d53"}, +] + +[package.dependencies] +numpy = "*" +pillow = ">=8.3.2" + +[package.extras] +all-plugins = ["astropy", "av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"] +all-plugins-pypy = ["av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"] +build = ["wheel"] +dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"] +docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"] +ffmpeg = ["imageio-ffmpeg", "psutil"] +fits = ["astropy"] +full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpydoc", "pillow-heif", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "sphinx (<6)", "tifffile", "wheel"] +gdal = ["gdal"] +itk = ["itk"] +linting = ["black", "flake8"] +pillow-heif = ["pillow-heif"] +pyav = ["av"] +test = ["fsspec[github]", "pytest", "pytest-cov"] +tifffile = ["tifffile"] + +[[package]] +name = "imageio-ffmpeg" +version = "0.4.9" +description = "FFMPEG wrapper for Python" +optional = false +python-versions = ">=3.5" +files = [ + {file = "imageio-ffmpeg-0.4.9.tar.gz", hash = "sha256:39bcd1660118ef360fa4047456501071364661aa9d9021d3d26c58f1ee2081f5"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24095e882a126a0d217197b86265f821b4bb3cf9004104f67c1384a2b4b49168"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl", hash = "sha256:2996c64af3e5489227096580269317719ea1a8121d207f2e28d6c24ebc4a253e"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7eead662d2f46d748c0ab446b68f423eb63d2b54d0a8ef96f80607245540866d"}, + {file = 
"imageio_ffmpeg-0.4.9-py3-none-win32.whl", hash = "sha256:b6de1e18911687c538d5585d8287ab1a23624ca9dc2044fcc4607de667bcf11e"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-win_amd64.whl", hash = "sha256:7e900c695c6541b1cb17feb1baacd4009b30a53a45b81c23d53a67ab13ffb766"}, +] + +[package.dependencies] +setuptools = "*" + [[package]] name = "iniconfig" version = "2.0.0" @@ -803,6 +864,30 @@ files = [ griffe = ">=0.37" mkdocstrings = ">=0.20" +[[package]] +name = "moviepy" +version = "1.0.3" +description = "Video editing with Python" +optional = false +python-versions = "*" +files = [ + {file = "moviepy-1.0.3.tar.gz", hash = "sha256:2884e35d1788077db3ff89e763c5ba7bfddbd7ae9108c9bc809e7ba58fa433f5"}, +] + +[package.dependencies] +decorator = ">=4.0.2,<5.0" +imageio = {version = ">=2.5,<3.0", markers = "python_version >= \"3.4\""} +imageio_ffmpeg = {version = ">=0.2.0", markers = "python_version >= \"3.4\""} +numpy = {version = ">=1.17.3", markers = "python_version > \"2.7\""} +proglog = "<=1.0.0" +requests = ">=2.8.1,<3.0" +tqdm = ">=4.11.2,<5.0" + +[package.extras] +doc = ["Sphinx (>=1.5.2,<2.0)", "numpydoc (>=0.6.0,<1.0)", "pygame (>=1.9.3,<2.0)", "sphinx_rtd_theme (>=0.1.10b0,<1.0)"] +optional = ["matplotlib (>=2.0.0,<3.0)", "opencv-python (>=3.0,<4.0)", "scikit-image (>=0.13.0,<1.0)", "scikit-learn", "scipy (>=0.19.0,<1.5)", "youtube_dl"] +test = ["coverage (<5.0)", "coveralls (>=1.1,<2.0)", "pytest (>=3.0.0,<4.0)", "pytest-cov (>=2.5.1,<3.0)", "requests (>=2.8.1,<3.0)"] + [[package]] name = "mpmath" version = "1.3.0" @@ -1106,6 +1191,29 @@ typing-extensions = ">=4.7,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +[[package]] +name = "opencv-python-headless" +version = "4.9.0.80" +description = "Wrapper package for OpenCV python bindings." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "opencv-python-headless-4.9.0.80.tar.gz", hash = "sha256:71a4cd8cf7c37122901d8e81295db7fb188730e33a0e40039a4e59c1030b0958"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:2ea8a2edc4db87841991b2fbab55fc07b97ecb602e0f47d5d485bd75cee17c1a"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0ee54e27be493e8f7850847edae3128e18b540dac1d7b2e4001b8944e11e1c6"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57ce2865e8fec431c6f97a81e9faaf23fa5be61011d0a75ccf47a3c0d65fa73d"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:976656362d68d9f40a5c66f83901430538002465f7db59142784f3893918f3df"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:11e3849d83e6651d4e7699aadda9ec7ed7c38957cbbcb99db074f2a2d2de9670"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:a8056c2cb37cd65dfcdf4153ca16f7362afcf3a50d600d6bb69c660fc61ee29c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, +] + [[package]] name = "packaging" version = "24.0" @@ -1325,6 +1433,20 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "proglog" +version = "0.1.10" +description = "Log and progress bar manager for console, notebooks, web..." 
+optional = false
+python-versions = "*"
+files = [
+    {file = "proglog-0.1.10-py3-none-any.whl", hash = "sha256:19d5da037e8c813da480b741e3fa71fb1ac0a5b02bf21c41577c7f327485ec50"},
+    {file = "proglog-0.1.10.tar.gz", hash = "sha256:658c28c9c82e4caeb2f25f488fff9ceace22f8d69b15d0c1c86d64275e4ddab4"},
+]
+
+[package.dependencies]
+tqdm = "*"
+
 [[package]]
 name = "pycodestyle"
 version = "2.9.1"
@@ -2488,4 +2610,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.12"
-content-hash = "dbb1f3241c006408ab8056349c63d7f947450c01fd518a758af66e2e5c000916"
+content-hash = "c22b1c0eb7fbae1f326837eacfe7af3dd0ee754d7a074c9ae1b465e05d65e98e"
diff --git a/pyproject.toml b/pyproject.toml
index 5af13d3a..2ea59ef3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,10 @@ sentence-transformers = "2.*"
 openai = "1.*"
 typing_extensions = "4.*"
 
+[tool.poetry.group.video.dependencies]
+moviepy = "1.*"
+opencv-python-headless = "4.*"
+
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
 pytest = "7.*"
@@ -84,4 +88,5 @@ module = [
     "faiss.*",
     "openai.*",
     "sentence_transformers.*",
+    "moviepy.*",
 ]
diff --git a/tests/data/video/test.mp4 b/tests/data/video/test.mp4
new file mode 100644
index 00000000..596eea38
Binary files /dev/null and b/tests/data/video/test.mp4 differ
diff --git a/tests/tools/test_video.py b/tests/tools/test_video.py
new file mode 100644
index 00000000..8952c238
--- /dev/null
+++ b/tests/tools/test_video.py
@@ -0,0 +1,7 @@
+from vision_agent.tools.video import extract_frames_from_video
+
+
+def test_extract_frames_from_video():
+    video_path = "tests/data/video/test.mp4"
+    res = extract_frames_from_video(video_path)
+    assert len(res) == 1
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index a2b75851..42b3a810 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -457,6 +457,33 @@ def __call__(self, input: List[int]) -> float:
         return round(input[0] / input[1], 2)
 
 
+class ExtractFrames(Tool):
+    name = "extract_frames_"
+    description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video."
+    usage = {
+        "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "examples": [
+            {
+                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
+                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
+            },
+            {
+                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4"},
+            },
+        ],
+    }
+
+    def __call__(self, video_uri: str) -> list[tuple[np.ndarray, float]]:
+        try:
+            from vision_agent.tools.video import extract_frames_from_video
+        except Exception as e:
+            raise ImportError(
+                "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead."
+            ) from e
+        return extract_frames_from_video(video_uri)
+
+
 TOOLS = {
     i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
     for i, c in enumerate(
@@ -472,6 +499,7 @@ def __call__(self, input: List[int]) -> float:
             Subtract,
             Multiply,
             Divide,
+            ExtractFrames,
         ]
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py
new file mode 100644
index 00000000..1957915d
--- /dev/null
+++ b/vision_agent/tools/video.py
@@ -0,0 +1,215 @@
+import logging
+import math
+import os
+import sys
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import cast
+
+import cv2
+import numpy as np
+from moviepy.video.io.VideoFileClip import VideoFileClip
+from tqdm import tqdm
+
+_LOGGER = logging.getLogger(__name__)
+# The maximum length of the clip to extract frames from, in seconds
+_CLIP_LENGTH = 30.0
+
+
+def extract_frames_from_video(
+    video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06
+) -> list[tuple[np.ndarray, float]]:
+    """Extract frames from a video
+
+    The video is split into segments of at most ``_CLIP_LENGTH`` seconds that are
+    scanned in parallel worker processes. Within a segment, a frame is only kept
+    when it differs enough from the previously kept frame.
+
+    Parameters
+    ----------
+    video_uri: str, the path to the video file or a video file url
+    fps: int, the frame rate per second to extract the frames
+    motion_detection_threshold: float, the threshold to detect the motion between frames.
+        A value between 0-1, the percentage change that is considered a different frame.
+        A lower value means more frames will be extracted.
+
+    Returns
+    -------
+    list[tuple[np.ndarray, float]], a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]
+        The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video.
+        The frames are sorted by the timestamp in ascending order.
+
+    Raises
+    ------
+    ValueError, if the video has no duration, i.e. there is nothing to extract.
+    """
+    # Only the duration is needed here. Each worker process opens the video on its
+    # own, so release this reader before the workers are started.
+    with VideoFileClip(video_uri) as video:
+        video_duration: float = video.duration
+    if not video_duration or video_duration <= 0:
+        raise ValueError(f"No frames to extract from the input video: {video_uri}")
+    clip_length: float = min(video_duration, _CLIP_LENGTH)
+    start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
+    segment_args = [
+        {
+            "video_uri": video_uri,
+            "start": start,
+            "end": (
+                start + clip_length if i < len(start_times) - 1 else video_duration
+            ),
+            "fps": fps,
+            "motion_detection_threshold": motion_detection_threshold,
+        }
+        for i, start in enumerate(start_times)
+    ]
+    last_segment = segment_args[-1]
+    if (
+        len(segment_args) > 1
+        and cast(float, last_segment["end"]) - cast(float, last_segment["start"]) < 1
+    ):
+        # If the last segment is less than 1s, merge it with the previous segment
+        # This is to avoid the failure of the last segment extraction.
+        # A video shorter than 1s has a single segment, which is kept as is.
+        segment_args.pop()
+        segment_args[-1]["end"] = video_duration
+    # Starting more workers than there are segments is pointless
+    num_workers = min(os.cpu_count() or 1, len(segment_args))
+    _LOGGER.info(
+        f"""Created {len(segment_args)} segments from the input video {video_uri} of length {video_duration}s, with clip size: {clip_length}s and {num_workers} workers.
+        Segments: {segment_args}
+        """
+    )
+    frames = []
+    with tqdm(total=len(segment_args)) as pbar:
+        with ProcessPoolExecutor(max_workers=num_workers) as executor:
+            futures = [
+                executor.submit(_extract_frames_by_clip, **kwargs)  # type: ignore
+                for kwargs in segment_args
+            ]
+            for future in as_completed(futures):
+                # future.result() re-raises any error that happened in the worker
+                frames.extend(future.result())
+                pbar.update(1)
+    # Segments finish in arbitrary order, restore the chronological order
+    frames.sort(key=lambda x: x[1])
+    _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
+    return frames
+
+
+def _extract_frames_by_clip(
+    video_uri: str,
+    start: int = 0,
+    end: float = -1,
+    fps: int = 2,
+    motion_detection_threshold: float = 0.06,
+) -> list[tuple[np.ndarray, float]]:
+    """Extract frames from a video clip with start and end time in seconds.
+
+    Parameters
+    ----------
+    video_uri: str, the path to the video file or a video file url
+    start: int, the start time (in seconds) of the clip to extract
+    end: float, the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video
+    fps: int, the frame rate to extract the frames
+    motion_detection_threshold: float, the threshold to detect the motion between frames
+
+    Returns
+    -------
+    list[tuple[np.ndarray, float]], the kept frames with their timestamp in seconds, in the order they were read.
+    """
+    with VideoFileClip(video_uri) as video:
+        source_fps = video.fps
+        if end <= 0:
+            end = video.duration
+        _LOGGER.info(
+            f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
+        )
+        clip = video.subclip(start, end)
+        processable_frames = int(clip.duration * fps)
+        _LOGGER.info(
+            f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
+        )
+        frames = []
+        total_count, skipped_count = 0, 0
+        prev_processed_frame = None
+        # The context manager closes the progress bar even if a frame fails
+        with tqdm(
+            total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
+        ) as pbar:
+            for frame in clip.iter_frames(fps=fps, dtype="uint8"):
+                curr_processed_frame = _preprocess_frame(frame)
+                total_count += 1
+                pbar.update(1)
+                # Skip the frame if it is similar to the previous one
+                if prev_processed_frame is not None and _similar_frame(
+                    prev_processed_frame,
+                    curr_processed_frame,
+                    threshold=motion_detection_threshold,
+                ):
+                    skipped_count += 1
+                    continue
+                prev_processed_frame = curr_processed_frame
+                # Timestamp derived from the position of the source reader
+                ts = round(clip.reader.pos / source_fps, 3)
+                frames.append((frame, ts))
+
+        _LOGGER.info(
+            f"""Finished!
+            Frames extracted: {len(frames)}
+            Extracted frame timestamp: {[f[1] for f in frames]}
+            Total processed frames: {total_count}
+            Skipped frames: {skipped_count}
+            Scan FPS: {fps}
+            Clip start time: {start}s
+            Clip end time: {end}s
+            Clip duration: {clip.duration}s
+            Clip total frames: {clip.duration * source_fps}
+            Video duration: {video.duration}s
+            Video FPS: {video.fps}
+            Video total frames: {video.reader.nframes}"""
+        )
+        return frames
+
+
+def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
+    """Convert a frame to a blurred grayscale image used for the motion detection"""
+    # The frames decoded by moviepy are in RGB order (not the BGR order of cv2.imread)
+    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+    # Blur to reduce the noise, so that tiny pixel changes are not counted as motion
+    frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0)
+    return frame
+
+
+def _similar_frame(
+    prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
+) -> bool:
+    """Detect two frames are similar or not
+
+    Parameters
+    ----------
+    prev_frame : np.ndarray, the preprocessed (grayscale) previous frame
+    curr_frame : np.ndarray, the preprocessed (grayscale) current frame
+    threshold : float
+        Similarity threshold, a value between 0-1, the percentage change that is considered a different frame.
+
+    Returns
+    -------
+    bool, True if the share of changed pixels is below the threshold.
+    """
+    diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame)
+    # Only take different areas that are different enough (>20 / 255)
+    thresh_frame = cv2.threshold(
+        src=diff_frame, thresh=20, maxval=255, type=cv2.THRESH_BINARY
+    )[1]
+    change_percentage = cv2.countNonZero(thresh_frame) / (
+        curr_frame.shape[0] * curr_frame.shape[1]
+    )
+    _LOGGER.debug(f"Image diff: {change_percentage}")
+    return change_percentage < threshold
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        sys.exit(f"Usage: python {sys.argv[0]} <video path or url>")
+    res = extract_frames_from_video(sys.argv[1])
+    print("done, extracted num frames: ", len(res))