From b8be42dfde8d1b0be1d72c665f95e801f8081ebd Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Tue, 26 Mar 2024 16:49:01 -0700 Subject: [PATCH] Update docs --- README.md | 1 + docs/index.md | 1 + vision_agent/tools/tools.py | 4 +++- vision_agent/tools/video.py | 39 +++++++++++++++---------------------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 5687dd43..6879e734 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ you. For example: | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. | | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. | | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. | +| ExtractFrames | ExtractFrames extracts image frames from the input video. | It also has a basic set of calculate tools such as add, subtract, multiply and divide. diff --git a/docs/index.md b/docs/index.md index 2a83bef7..e0033dfa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -90,6 +90,7 @@ you. For example: | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. | | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. | | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. | +| ExtractFrames | ExtractFrames extracts image frames from the input video. | It also has a basic set of calculate tools such as add, subtract, multiply and divide. diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 99afcee3..6e55f210 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -507,6 +507,8 @@ def __call__(self, input: List[int]) -> float: class ExtractFrames(Tool): + r"""Extract frames from a video.""" + name = "extract_frames_" description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame." usage = { @@ -524,7 +526,7 @@ class ExtractFrames(Tool): } def __call__(self, video_uri: str) -> list[tuple[str, float]]: - """Extract frames from a video clip with start and end time in seconds. + """Extract frames from a video. Parameters: diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py index a86efc49..5745c2fe 100644 --- a/vision_agent/tools/video.py +++ b/vision_agent/tools/video.py @@ -19,19 +19,15 @@ def extract_frames_from_video( ) -> list[tuple[np.ndarray, float]]: """Extract frames from a video - Parameters - ---------- - video_uri: str, the path to the video file or a video file url - fps: int, the frame rate per second to extract the frames - motion_detection_threshold: float, The threshold to detect motion between changes/frames. - A value between 0-1, which represents the percentage change required for the frames to be considered in motion. - For example, a lower value means more frames will be extracted. + Parameters: + video_uri: the path to the video file or a video file url + fps: the frame rate per second to extract the frames + motion_detection_threshold: The threshold to detect motion between changes/frames. + A value between 0-1, which represents the percentage change required for the frames to be considered in motion. + For example, a lower value means more frames will be extracted. - Returns - ------- - list[tuple[np.ndarray, int]], a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...] - The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. - The frames are sorted by the timestamp in ascending order. + Returns: + a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. """ with VideoFileClip(video_uri) as video: video_duration: float = video.duration @@ -93,13 +89,12 @@ def _extract_frames_by_clip( ) -> list[tuple[np.ndarray, float]]: """Extract frames from a video clip with start and end time in seconds. - Parameters - ---------- - video_uri: str, the path to the video file or a video file url - start: int, the start time (in seconds) of the clip to extract - end: float, the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video - fps: int, the frame rate to extract the frames - motion_detection_threshold: float, the threshold to detect the motion between frames + Parameters: + video_uri: the path to the video file or a video file url + start: the start time (in seconds) of the clip to extract + end: the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video + fps: the frame rate to extract the frames + motion_detection_threshold: the threshold to detect the motion between frames """ with VideoFileClip(video_uri) as video: source_fps = video.fps @@ -165,10 +160,8 @@ def _similar_frame( ) -> bool: """Detect two frames are similar or not - Parameters - ---------- - threshold : float, optional - Similarity threshold, a value between 0-1, the percentage change that is considered a different frame. + Parameters: + threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame. """ # calculate difference and update previous frame TODO: don't assume the processed image is cached diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame)