diff --git a/poetry.lock b/poetry.lock
index 3f479663..d7c66b5d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -257,6 +257,17 @@ files = [
 [package.extras]
 dev = ["black", "flake8", "flake8-pyi", "matplotlib", "mypy (==0.770)", "numpy", "pandas", "pytest"]
 
+[[package]]
+name = "decorator"
+version = "4.4.2"
+description = "Decorators for Humans"
+optional = false
+python-versions = ">=2.6, !=3.0.*, !=3.1.*"
+files = [
+    {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"},
+    {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"},
+]
+
 [[package]]
 name = "distro"
 version = "1.9.0"
@@ -518,6 +529,56 @@ files = [
     {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"},
 ]
 
+[[package]]
+name = "imageio"
+version = "2.34.0"
+description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "imageio-2.34.0-py3-none-any.whl", hash = "sha256:08082bf47ccb54843d9c73fe9fc8f3a88c72452ab676b58aca74f36167e8ccba"},
+    {file = "imageio-2.34.0.tar.gz", hash = "sha256:ae9732e10acf807a22c389aef193f42215718e16bd06eed0c5bb57e1034a4d53"},
+]
+
+[package.dependencies]
+numpy = "*"
+pillow = ">=8.3.2"
+
+[package.extras]
+all-plugins = ["astropy", "av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"]
+all-plugins-pypy = ["av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"]
+build = ["wheel"]
+dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"]
+docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"]
+ffmpeg = ["imageio-ffmpeg", "psutil"]
+fits = ["astropy"]
+full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpydoc", "pillow-heif", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "sphinx (<6)", "tifffile", "wheel"]
+gdal = ["gdal"]
+itk = ["itk"]
+linting = ["black", "flake8"]
+pillow-heif = ["pillow-heif"]
+pyav = ["av"]
+test = ["fsspec[github]", "pytest", "pytest-cov"]
+tifffile = ["tifffile"]
+
+[[package]]
+name = "imageio-ffmpeg"
+version = "0.4.9"
+description = "FFMPEG wrapper for Python"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "imageio-ffmpeg-0.4.9.tar.gz", hash = "sha256:39bcd1660118ef360fa4047456501071364661aa9d9021d3d26c58f1ee2081f5"},
+    {file = "imageio_ffmpeg-0.4.9-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24095e882a126a0d217197b86265f821b4bb3cf9004104f67c1384a2b4b49168"},
+    {file = "imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl", hash = "sha256:2996c64af3e5489227096580269317719ea1a8121d207f2e28d6c24ebc4a253e"},
+    {file = "imageio_ffmpeg-0.4.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7eead662d2f46d748c0ab446b68f423eb63d2b54d0a8ef96f80607245540866d"},
+    {file = "imageio_ffmpeg-0.4.9-py3-none-win32.whl", hash = "sha256:b6de1e18911687c538d5585d8287ab1a23624ca9dc2044fcc4607de667bcf11e"},
+    {file = "imageio_ffmpeg-0.4.9-py3-none-win_amd64.whl", hash = "sha256:7e900c695c6541b1cb17feb1baacd4009b30a53a45b81c23d53a67ab13ffb766"},
+]
+
+[package.dependencies]
+setuptools = "*"
+
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -803,6 +864,30 @@ files = [
 griffe = ">=0.37"
 mkdocstrings = ">=0.20"
 
+[[package]]
+name = "moviepy"
+version = "1.0.3"
+description = "Video editing with Python"
+optional = false
+python-versions = "*"
+files = [
+    {file = "moviepy-1.0.3.tar.gz", hash = "sha256:2884e35d1788077db3ff89e763c5ba7bfddbd7ae9108c9bc809e7ba58fa433f5"},
+]
+
+[package.dependencies]
+decorator = ">=4.0.2,<5.0"
+imageio = {version = ">=2.5,<3.0", markers = "python_version >= \"3.4\""}
+imageio_ffmpeg = {version = ">=0.2.0", markers = "python_version >= \"3.4\""}
+numpy = {version = ">=1.17.3", markers = "python_version > \"2.7\""}
+proglog = "<=1.0.0"
+requests = ">=2.8.1,<3.0"
+tqdm = ">=4.11.2,<5.0"
+
+[package.extras]
+doc = ["Sphinx (>=1.5.2,<2.0)", "numpydoc (>=0.6.0,<1.0)", "pygame (>=1.9.3,<2.0)", "sphinx_rtd_theme (>=0.1.10b0,<1.0)"]
+optional = ["matplotlib (>=2.0.0,<3.0)", "opencv-python (>=3.0,<4.0)", "scikit-image (>=0.13.0,<1.0)", "scikit-learn", "scipy (>=0.19.0,<1.5)", "youtube_dl"]
+test = ["coverage (<5.0)", "coveralls (>=1.1,<2.0)", "pytest (>=3.0.0,<4.0)", "pytest-cov (>=2.5.1,<3.0)", "requests (>=2.8.1,<3.0)"]
+
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@@ -1106,6 +1191,29 @@ typing-extensions = ">=4.7,<5"
 [package.extras]
 datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"]
 
+[[package]]
+name = "opencv-python-headless"
+version = "4.9.0.80"
+description = "Wrapper package for OpenCV python bindings."
+optional = false
+python-versions = ">=3.6"
+files = [
+    {file = "opencv-python-headless-4.9.0.80.tar.gz", hash = "sha256:71a4cd8cf7c37122901d8e81295db7fb188730e33a0e40039a4e59c1030b0958"},
+    {file = "opencv_python_headless-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:2ea8a2edc4db87841991b2fbab55fc07b97ecb602e0f47d5d485bd75cee17c1a"},
+    {file = "opencv_python_headless-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0ee54e27be493e8f7850847edae3128e18b540dac1d7b2e4001b8944e11e1c6"},
+    {file = "opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57ce2865e8fec431c6f97a81e9faaf23fa5be61011d0a75ccf47a3c0d65fa73d"},
+    {file = "opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:976656362d68d9f40a5c66f83901430538002465f7db59142784f3893918f3df"},
+    {file = "opencv_python_headless-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:11e3849d83e6651d4e7699aadda9ec7ed7c38957cbbcb99db074f2a2d2de9670"},
+    {file = "opencv_python_headless-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:a8056c2cb37cd65dfcdf4153ca16f7362afcf3a50d600d6bb69c660fc61ee29c"},
+]
+
+[package.dependencies]
+numpy = [
+    {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
+    {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+    {version = ">=1.23.5", markers = "python_version >= \"3.11\""},
+]
+
 [[package]]
 name = "packaging"
 version = "24.0"
@@ -1325,6 +1433,20 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "proglog"
+version = "0.1.10"
+description = "Log and progress bar manager for console, notebooks, web..."
+optional = false
+python-versions = "*"
+files = [
+    {file = "proglog-0.1.10-py3-none-any.whl", hash = "sha256:19d5da037e8c813da480b741e3fa71fb1ac0a5b02bf21c41577c7f327485ec50"},
+    {file = "proglog-0.1.10.tar.gz", hash = "sha256:658c28c9c82e4caeb2f25f488fff9ceace22f8d69b15d0c1c86d64275e4ddab4"},
+]
+
+[package.dependencies]
+tqdm = "*"
+
 [[package]]
 name = "pycodestyle"
 version = "2.9.1"
@@ -2488,4 +2610,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.10,<3.12"
-content-hash = "dbb1f3241c006408ab8056349c63d7f947450c01fd518a758af66e2e5c000916"
+content-hash = "c22b1c0eb7fbae1f326837eacfe7af3dd0ee754d7a074c9ae1b465e05d65e98e"
diff --git a/pyproject.toml b/pyproject.toml
index 5af13d3a..2ea59ef3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,10 @@ sentence-transformers = "2.*"
 openai = "1.*"
 typing_extensions = "4.*"
 
+[tool.poetry.group.video.dependencies]
+moviepy = "1.*"
+opencv-python-headless = "4.*"
+
 [tool.poetry.group.dev.dependencies]
 autoflake = "1.*"
 pytest = "7.*"
@@ -84,4 +88,5 @@ module = [
     "faiss.*",
     "openai.*",
     "sentence_transformers.*",
+    "moviepy.*",
 ]
diff --git a/tests/data/video/test.mp4 b/tests/data/video/test.mp4
new file mode 100644
index 00000000..596eea38
Binary files /dev/null and b/tests/data/video/test.mp4 differ
diff --git a/tests/tools/test_video.py b/tests/tools/test_video.py
new file mode 100644
index 00000000..8952c238
--- /dev/null
+++ b/tests/tools/test_video.py
@@ -0,0 +1,7 @@
+from vision_agent.tools.video import extract_frames_from_video
+
+
+def test_extract_frames_from_video():
+    # With the default settings the bundled sample clip is expected to yield exactly one frame.
+    frames = extract_frames_from_video("tests/data/video/test.mp4")
+    assert len(frames) == 1
diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py
index a2b75851..42b3a810 100644
--- a/vision_agent/tools/tools.py
+++ b/vision_agent/tools/tools.py
@@ -457,6 +457,33 @@ def __call__(self, input: List[int]) -> float:
         return round(input[0] / input[1], 2)
 
 
+class ExtractFrames(Tool):  # Tool that delegates to vision_agent.tools.video.extract_frames_from_video
+    name = "extract_frames_"
+    description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video."
+    usage = {  # One required parameter (video_uri) plus two example prompts: a URL and a local path
+        "required_parameters": [{"name": "video_uri", "type": "str"}],
+        "examples": [
+            {
+                "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4",
+                "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"},
+            },
+            {
+                "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4",
+                "parameters": {"video_uri": "tests/data/test.mp4"},
+            },
+        ],
+    }
+
+    def __call__(self, video_uri: str) -> list[tuple[np.ndarray, float]]:
+        try:  # Imported inside the call rather than at module import; presumably keeps the video deps optional
+            from vision_agent.tools.video import extract_frames_from_video
+        except Exception as e:  # NOTE(review): catches every error raised during the import, not only ImportError
+            raise ImportError(  # NOTE(review): message points at a "video" pip extra; confirm one is actually published
+                "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead."
+            ) from e
+        return extract_frames_from_video(video_uri)
+
+
 TOOLS = {
     i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c}
     for i, c in enumerate(
@@ -472,6 +499,7 @@ def __call__(self, input: List[int]) -> float:
             Subtract,
             Multiply,
             Divide,
+            ExtractFrames,
         ]
     )
     if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage"))
diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py
new file mode 100644
index 00000000..1957915d
--- /dev/null
+++ b/vision_agent/tools/video.py
@@ -0,0 +1,190 @@
+import logging
+import math
+import os
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import cast
+
+import cv2
+import numpy as np
+from moviepy.video.io.VideoFileClip import VideoFileClip
+from tqdm import tqdm
+
+_LOGGER = logging.getLogger(__name__)
+# The maximum length of the clip to extract frames from, in seconds
+_CLIP_LENGTH = 30.0
+
+
+def extract_frames_from_video(
+    video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06
+) -> list[tuple[np.ndarray, float]]:
+    """Extract frames from a video
+
+    Parameters
+    ----------
+    video_uri: str, the path to the video file or a video file url
+    fps: int, the frame rate per second to extract the frames
+    motion_detection_threshold: float, the threshold to detect the motion between frames.
+        A value between 0-1, the percentage change that is considered a different frame; a lower value means more frames will be extracted.
+
+    Returns
+    -------
+    list[tuple[np.ndarray, float]], a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]
+        The timestamp is the time in seconds from the start of the video (e.g. 12.125). The frames are sorted by the timestamp in ascending order.
+
+    Raises
+    ------
+    ValueError, if the video reports a duration of zero or less, i.e. there is nothing to extract
+    """
+    with VideoFileClip(video_uri) as video:
+        video_duration: float = video.duration
+        if video_duration <= 0:
+            raise ValueError(f"No frames to extract from the input video: {video_uri}")
+        num_workers = os.cpu_count()
+        clip_length: float = min(video_duration, _CLIP_LENGTH)
+        start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length)))
+        segment_args = [
+            {
+                "video_uri": video_uri,
+                "start": start,
+                "end": (
+                    start + clip_length if i < len(start_times) - 1 else video_duration
+                ),
+                "fps": fps,
+                "motion_detection_threshold": motion_detection_threshold,
+            }
+            for i, start in enumerate(start_times)
+        ]
+        if (
+            len(segment_args) > 1
+            and cast(float, segment_args[-1]["end"])
+            - cast(float, segment_args[-1]["start"])
+            < 1
+        ):
+            # If the last segment is less than 1s, merge it with the previous segment to avoid the failure
+            # of the last segment extraction. A lone sub-second segment (very short video) is kept as is.
+            segment_args[-2]["end"] = video_duration
+            segment_args.pop(-1)
+        _LOGGER.info(
+            f"""Created {len(segment_args)} segments from the input video  {video_uri} of length {video.duration}s, with clip size: {clip_length}s and {num_workers} workers.
+            Segments: {segment_args}
+            """
+        )
+        frames = []
+        with tqdm(total=len(segment_args)) as pbar:
+            with ProcessPoolExecutor(max_workers=num_workers) as executor:
+                futures = [
+                    executor.submit(_extract_frames_by_clip, **kwargs)  # type: ignore
+                    for kwargs in segment_args
+                ]
+                for future in as_completed(futures):
+                    frames.extend(future.result())
+                    pbar.update(1)
+        frames.sort(key=lambda x: x[1])
+        _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}")
+        return frames
+
+
+def _extract_frames_by_clip(
+    video_uri: str,
+    start: int = 0,
+    end: float = -1,
+    fps: int = 2,
+    motion_detection_threshold: float = 0.06,
+) -> list[tuple[np.ndarray, float]]:
+    """Extract frames from a video clip with start and end time in seconds, returned as (frame, timestamp) tuples.
+
+    Parameters
+    ----------
+    video_uri: str, the path to the video file or a video file url
+    start: int, the start time (in seconds) of the clip to extract
+    end: float, the end time (in seconds, up to millisecond level precision) of the clip to extract, if <= 0, extract up to the end of the video
+    fps: int, the frame rate to extract the frames
+    motion_detection_threshold: float, the threshold to detect the motion between frames
+    """
+    with VideoFileClip(video_uri) as video:
+        source_fps = video.fps
+        if end <= 0:
+            end = video.duration
+        _LOGGER.info(
+            f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s"
+        )
+        clip = video.subclip(start, end)
+        processable_frames = int(clip.duration * fps)
+        _LOGGER.info(
+            f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}"
+        )
+        frames = []
+        total_count, skipped_count = 0, 0
+        prev_processed_frame = None
+        with tqdm(
+            total=processable_frames, desc=f"Extracting frames from clip {start}-{end}"
+        ) as pbar:  # context manager closes the progress bar even if a frame fails
+            for frame in clip.iter_frames(fps=fps, dtype="uint8"):
+                curr_processed_frame = _preprocess_frame(frame)
+                total_count += 1
+                pbar.update(1)
+                # Skip the frame if it is similar to the last frame that was kept
+                if prev_processed_frame is not None and _similar_frame(
+                    prev_processed_frame,
+                    curr_processed_frame,
+                    threshold=motion_detection_threshold,
+                ):
+                    skipped_count += 1
+                    continue
+                prev_processed_frame = curr_processed_frame
+                # NOTE(review): reader.pos may be 1-based, making ts one source frame late; confirm
+                ts = round(clip.reader.pos / source_fps, 3)
+                frames.append((frame, ts))
+        _LOGGER.info(
+            f"""Finished!
+                Frames extracted: {len(frames)}
+                Extracted frame timestamp: {[f[1] for f in frames]}
+                Total processed frames: {total_count}
+                Skipped frames:  {skipped_count}
+                Scan FPS: {fps}
+                Clip start time: {start}s, {clip.pos}
+                Clip end time: {end}s
+                Clip duration: {clip.duration}s
+                Clip total frames: {clip.duration * source_fps}
+                Video duration: {video.duration}s
+                Video FPS: {video.fps}
+                Video total frames: {video.reader.nframes}"""
+        )
+        return frames
+
+
+def _preprocess_frame(frame: np.ndarray) -> np.ndarray:
+    """Return a blurred grayscale copy of an RGB frame, used only for frame-to-frame comparison."""
+    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)  # moviepy yields RGB frames, not OpenCV's BGR
+    frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0)
+    return frame
+
+
+def _similar_frame(
+    prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float
+) -> bool:
+    """Return True when the fraction of changed pixels between two frames is below `threshold`.
+
+    Parameters
+    ----------
+    prev_frame, curr_frame : np.ndarray, the two frames to compare
+    threshold : float, fraction (0-1) of changed pixels at which the frames count as different
+    """
+    # Per-pixel absolute difference between the two frames
+    delta = cv2.absdiff(src1=prev_frame, src2=curr_frame)
+    # Binarise: only pixels whose difference exceeds 20 (out of 255) count as changed
+    _, changed_mask = cv2.threshold(
+        src=delta, thresh=20, maxval=255, type=cv2.THRESH_BINARY
+    )
+    # Share of changed pixels relative to the frame's height x width
+    height, width = curr_frame.shape[0], curr_frame.shape[1]
+    change_percentage = cv2.countNonZero(changed_mask) / (height * width)
+    _LOGGER.debug(f"Image diff: {change_percentage}")
+    return change_percentage < threshold
+
+
+# Manual smoke test. Usage: python -m vision_agent.tools.video <video path or url>
+if __name__ == "__main__":
+    import sys  # needed only when this module is run as a script
+    res = extract_frames_from_video(sys.argv[1])
+    print("done, extracted num frames: ", len(res))