diff --git a/poetry.lock b/poetry.lock index 7f1126b1..b6356fcf 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "annotated-types" @@ -557,13 +557,13 @@ files = [ [[package]] name = "e2b" -version = "0.17.2a56" +version = "0.17.2a57" description = "E2B SDK that give agents cloud environments" optional = false python-versions = "<4.0,>=3.8" files = [ - {file = "e2b-0.17.2a56-py3-none-any.whl", hash = "sha256:19db2c8fce72f4fd08f7d5538184a0b237551817b9e80879b65503090a7b59b9"}, - {file = "e2b-0.17.2a56.tar.gz", hash = "sha256:7932ec1b7ab4e588d8769280698725085103b34eaca33e678d4b1a42bc2ff8fd"}, + {file = "e2b-0.17.2a57-py3-none-any.whl", hash = "sha256:db1bfd4cb65d10833faab2df386db35ed3fd7ab1ebee452414d8d006da848119"}, + {file = "e2b-0.17.2a57.tar.gz", hash = "sha256:92f77fdfa646ad83a40ed1e7bdc3c25fd76238eea016a5c96668f5f6d9807548"}, ] [package.dependencies] @@ -591,28 +591,6 @@ attrs = ">=21.3.0" e2b = ">=0.17.2a50,<0.18.0" httpx = ">=0.20.0,<0.28.0" -[[package]] -name = "eva-decord" -version = "0.6.1" -description = "EVA's Decord Video Loader" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "eva_decord-0.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a81c49d11c3f93c23b40fb106854d6c0b5548508e4b7971ade50c4d1ae4ad68f"}, - {file = "eva_decord-0.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0d7d4c6a698ac4ad3b14c3c85773bba8570d8a1431204a237365e17a940f48c7"}, - {file = "eva_decord-0.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f1e756887aa1833dadd0aee0f4e3b3dc10a9080b53a73001501c22eec311f78b"}, - {file = "eva_decord-0.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ae41d7958b7d6fc3af66ae1b4072d6f938abe04f2016b56891688ac8a78ee158"}, - {file = "eva_decord-0.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:b2aae6fa0968ef5816fe09109aa87227cc5dbc5e3b0ae3a24c1de8d948776799"}, - {file = "eva_decord-0.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b44e20f401f4e7a52e6b1a6cb95fe06a40de4f02be5386da07c6d8f4851ab4ed"}, - {file = "eva_decord-0.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:af1a74f414fc84c35b45478aed7868b5afd323fb2b5c50e916ef7efa17524fb1"}, - {file = "eva_decord-0.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c64446dab22acb0ae44f3ee3190cb923fb538c74a4aa22a7fd8340ce3642c5cb"}, - {file = "eva_decord-0.6.1-py3-none-manylinux2010_x86_64.whl", hash = "sha256:75dabf364f2df5dc4c78d685cdeca29733ac422f53508a3c117f1387f1d0ef81"}, - {file = "eva_decord-0.6.1-py3-none-win_amd64.whl", hash = "sha256:f9f09369bef73075d945383bfaf1e41c3db118e7148719369ea134506e4bb525"}, -] - -[package.dependencies] -numpy = ">=1.14.0" - [[package]] name = "exceptiongroup" version = "1.2.2" @@ -657,19 +635,19 @@ devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benc [[package]] name = "filelock" -version = "3.15.4" +version = "3.16.0" description = "A platform independent file lock." 
optional = false python-versions = ">=3.8" files = [ - {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"}, - {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"}, + {file = "filelock-3.16.0-py3-none-any.whl", hash = "sha256:f6ed4c963184f4c84dd5557ce8fece759a3724b37b80c6c4f20a2f63a4dc6609"}, + {file = "filelock-3.16.0.tar.gz", hash = "sha256:81de9eb8453c769b63369f87f11131a7ab04e367f8d97ad39dc230daa07e3bec"}, ] [package.extras] -docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-asyncio (>=0.21)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)", "virtualenv (>=20.26.2)"] -typing = ["typing-extensions (>=4.8)"] +docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.1.1)", "pytest (>=8.3.2)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.3)"] +typing = ["typing-extensions (>=4.12.2)"] [[package]] name = "flake8" @@ -1206,13 +1184,13 @@ test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout" [[package]] name = "langsmith" -version = "0.1.115" +version = "0.1.117" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
optional = false python-versions = "<4.0,>=3.8.1" files = [ - {file = "langsmith-0.1.115-py3-none-any.whl", hash = "sha256:04e35cfd4c2d4ff1ea10bb577ff43957b05ebb3d9eb4e06e200701f4a2b4ac9f"}, - {file = "langsmith-0.1.115.tar.gz", hash = "sha256:3b775377d858d32354f3ee0dd1ed637068cfe9a1f13e7b3bfa82db1615cdffc9"}, + {file = "langsmith-0.1.117-py3-none-any.whl", hash = "sha256:e936ee9bcf8293b0496df7ba462a3702179fbe51f9dc28744b0fbec0dbf206ae"}, + {file = "langsmith-0.1.117.tar.gz", hash = "sha256:a1b532f49968b9339bcaff9118d141846d52ed3d803f342902e7448edf1d662b"}, ] [package.dependencies] @@ -1735,13 +1713,13 @@ files = [ [[package]] name = "openai" -version = "1.43.1" +version = "1.44.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.43.1-py3-none-any.whl", hash = "sha256:23ed3aa71e89cf644c911f7ab80087d08c0bf46ce6b75d9a811fc7942cff85c2"}, - {file = "openai-1.43.1.tar.gz", hash = "sha256:b64843711b7c92ded36795062ea1f8cad84ec6c2848646f2a786ac4617a6b9f5"}, + {file = "openai-1.44.1-py3-none-any.whl", hash = "sha256:07e2c2758d1c94151c740b14dab638ba0d04bcb41a2e397045c90e7661cdf741"}, + {file = "openai-1.44.1.tar.gz", hash = "sha256:e0ffdab601118329ea7529e684b606a72c6c9d4f05be9ee1116255fcf5593874"}, ] [package.dependencies] @@ -2156,19 +2134,19 @@ tests-min = ["defusedxml", "packaging", "pytest"] [[package]] name = "platformdirs" -version = "4.2.2" +version = "4.3.2" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.8" files = [ - {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"}, - {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"}, + {file = "platformdirs-4.3.2-py3-none-any.whl", hash = "sha256:eb1c8582560b34ed4ba105009a4badf7f6f85768b30126f351328507b2beb617"}, + {file = "platformdirs-4.3.2.tar.gz", hash = "sha256:9e5e27a08aa095dd127b9f2e764d74254f482fef22b0970773bfba79d091ab8c"}, ] [package.extras] -docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] -type = ["mypy (>=1.8)"] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] [[package]] name = "pluggy" @@ -2425,13 +2403,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" [[package]] name = "pydantic-settings" -version = "2.4.0" +version = "2.5.0" description = "Settings management using Pydantic" optional = false python-versions = ">=3.8" files = [ - {file = "pydantic_settings-2.4.0-py3-none-any.whl", hash = "sha256:bb6849dc067f1687574c12a639e231f3a6feeed0a12d710c1382045c5db1c315"}, - {file = "pydantic_settings-2.4.0.tar.gz", hash = "sha256:ed81c3a0f46392b4d7c0a565c05884e6e54b3456e6f0fe4d8814981172dc9a88"}, + {file = "pydantic_settings-2.5.0-py3-none-any.whl", hash = "sha256:eae04a3dd9adf521a4c959dcefb984e0f3b1d841999daf02f961dcc4d31d2f7f"}, + {file = "pydantic_settings-2.5.0.tar.gz", hash = "sha256:204828c02481a2e7135466b26a7d65d9e15a17d52d1d8f59cacdf9ad625e1140"}, ] [package.dependencies] @@ -2924,13 +2902,13 @@ tests = ["coverage (>=6.0.0)", "flake8", "mypy", 
"pytest (>=7.0.0)", "pytest-asy [[package]] name = "rich" -version = "13.8.0" +version = "13.8.1" description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" optional = false python-versions = ">=3.7.0" files = [ - {file = "rich-13.8.0-py3-none-any.whl", hash = "sha256:2e85306a063b9492dffc86278197a60cbece75bcb766022f3436f567cae11bdc"}, - {file = "rich-13.8.0.tar.gz", hash = "sha256:a5ac1f1cd448ade0d59cc3356f7db7a7ccda2c8cbae9c7a90c28ff463d3e91f4"}, + {file = "rich-13.8.1-py3-none-any.whl", hash = "sha256:1760a3c0848469b97b558fc61c85233e3dafb69c7a071b4d60c38099d3cd4c06"}, + {file = "rich-13.8.1.tar.gz", hash = "sha256:8260cda28e3db6bf04d2d1ef4dbc03ba80a824c88b0e7668a0f23126a424844a"}, ] [package.dependencies] @@ -3457,13 +3435,13 @@ files = [ [[package]] name = "types-requests" -version = "2.32.0.20240905" +version = "2.32.0.20240907" description = "Typing stubs for requests" optional = false python-versions = ">=3.8" files = [ - {file = "types-requests-2.32.0.20240905.tar.gz", hash = "sha256:e97fd015a5ed982c9ddcd14cc4afba9d111e0e06b797c8f776d14602735e9bd6"}, - {file = "types_requests-2.32.0.20240905-py3-none-any.whl", hash = "sha256:f46ecb55f5e1a37a58be684cf3f013f166da27552732ef2469a0cc8e62a72881"}, + {file = "types-requests-2.32.0.20240907.tar.gz", hash = "sha256:ff33935f061b5e81ec87997e91050f7b4af4f82027a7a7a9d9aaea04a963fdf8"}, + {file = "types_requests-2.32.0.20240907-py3-none-any.whl", hash = "sha256:1d1e79faeaf9d42def77f3c304893dea17a97cae98168ac69f3cb465516ee8da"}, ] [package.dependencies] @@ -3532,13 +3510,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "virtualenv" -version = "20.26.3" +version = "20.26.4" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"}, - {file = "virtualenv-20.26.3.tar.gz", hash = 
"sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"}, + {file = "virtualenv-20.26.4-py3-none-any.whl", hash = "sha256:48f2695d9809277003f30776d155615ffc11328e6a0a8c1f0ec80188d7874a55"}, + {file = "virtualenv-20.26.4.tar.gz", hash = "sha256:c17f4e0f3e6036e9f26700446f85c76ab11df65ff6d8a9cbfad9f71aabfcf23c"}, ] [package.dependencies] @@ -3625,4 +3603,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "f9ebed539e44012292a6637d32a8a649dd44ad37f1eab9fb41b12493c700cdc0" +content-hash = "bead91bd0ca1f1b9ecca03980370fbf63bcd345599e89bbd4b5b412c53de3b9f" diff --git a/pyproject.toml b/pyproject.toml index 7f6ac94d..ff4bc5c1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ pillow-heif = "^0.16.0" pytube = "15.0.0" anthropic = "^0.31.0" pydantic = "2.7.4" -eva-decord = "^0.6.1" av = "^11.0.0" [tool.poetry.group.dev.dependencies] diff --git a/vision_agent/tools/__init__.py b/vision_agent/tools/__init__.py index f7b1e4c0..a401fb46 100644 --- a/vision_agent/tools/__init__.py +++ b/vision_agent/tools/__init__.py @@ -37,6 +37,7 @@ grounding_dino, grounding_sam, ixc25_image_vqa, + ixc25_temporal_localization, ixc25_video_vqa, load_image, loca_visual_prompt_counting, diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index ef150f55..63927f01 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -781,6 +781,44 @@ def ixc25_video_vqa(prompt: str, frames: List[np.ndarray]) -> str: return cast(str, data["answer"]) +def ixc25_temporal_localization(prompt: str, frames: List[np.ndarray]) -> List[bool]: + """'ixc25_temporal_localization' uses ixc25_video_vqa to temporally segment a video + given a prompt that can be either an object or a phrase. It returns a list of + boolean values indicating whether the object or phrase is present in the + corresponding frame. 
+ + Parameters: + prompt (str): The question about the video + frames (List[np.ndarray]): The reference frames used for the question + + Returns: + List[bool]: A list of boolean values indicating whether the object or phrase is + present in the corresponding frame. + + Example + ------- + >>> output = ixc25_temporal_localization('soccer goal', frames) + >>> print(output) + [False, False, False, True, True, True, False, False, False, False] + >>> save_video([f for i, f in enumerate(frames) if output[i]], 'output.mp4') + """ + + buffer_bytes = frames_to_bytes(frames) + files = [("video", buffer_bytes)] + payload = { + "prompt": prompt, + "chunk_length": 2, + "function_name": "ixc25_temporal_localization", + } + data: List[int] = send_inference_request( + payload, "video-temporal-localization", files=files, v2=True + ) + chunk_size = round(len(frames) / len(data)) + data_explode = [[elt] * chunk_size for elt in data] + data_bool = [bool(elt) for sublist in data_explode for elt in sublist] + return data_bool[: len(frames)] + + def gpt4o_image_vqa(prompt: str, image: np.ndarray) -> str: """'gpt4o_image_vqa' is a tool that can answer any questions about arbitrary images including regular images or images of documents or presentations. It returns text diff --git a/vision_agent/utils/video.py b/vision_agent/utils/video.py index d306f295..388b95dd 100644 --- a/vision_agent/utils/video.py +++ b/vision_agent/utils/video.py @@ -103,7 +103,7 @@ def frames_to_bytes( def extract_frames_from_video( video_uri: str, fps: float = 1.0 ) -> List[Tuple[np.ndarray, float]]: - """Extract frames from a video + """Extract frames from a video along with the timestamp in seconds. Parameters: video_uri (str): the path to the video file or a video file url @@ -115,12 +115,24 @@ def extract_frames_from_video( from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. 
""" - vr = VideoReader(video_uri) - orig_fps = vr.get_avg_fps() - if fps > orig_fps: - fps = orig_fps - - s = orig_fps / fps - samples = [(int(i * s), int(i * s) / orig_fps) for i in range(int(len(vr) / s))] - frames = vr.get_batch([s[0] for s in samples]).asnumpy() - return [(frames[i, :, :, :], samples[i][1]) for i in range(len(samples))] + + cap = cv2.VideoCapture(video_uri) + orig_fps = cap.get(cv2.CAP_PROP_FPS) + orig_frame_time = 1 / orig_fps + targ_frame_time = 1 / fps + frames = [] + i = 0 + elapsed_time = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + elapsed_time += orig_frame_time + if elapsed_time >= targ_frame_time: + frames.append((cv2.cvtColor(frame, cv2.COLOR_BGR2RGB), i / orig_fps)) + elapsed_time -= targ_frame_time + + i += 1 + cap.release() + return frames