From 6cf110bfd2ebdc24fceca0bf9cab6c30f7e9841e Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Mon, 25 Mar 2024 15:07:32 -0700 Subject: [PATCH 1/7] Add frame extraction tool for video processing --- poetry.lock | 124 ++++++++++++++++++++++- pyproject.toml | 5 + tests/data/video/test.mp4 | Bin 0 -> 17317 bytes tests/tools/test_video.py | 7 ++ vision_agent/tools/tools.py | 28 ++++++ vision_agent/tools/video.py | 190 ++++++++++++++++++++++++++++++++++++ 6 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 tests/data/video/test.mp4 create mode 100644 tests/tools/test_video.py create mode 100644 vision_agent/tools/video.py diff --git a/poetry.lock b/poetry.lock index 3f479663..d7c66b5d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -257,6 +257,17 @@ files = [ [package.extras] dev = ["black", "flake8", "flake8-pyi", "matplotlib", "mypy (==0.770)", "numpy", "pandas", "pytest"] +[[package]] +name = "decorator" +version = "4.4.2" +description = "Decorators for Humans" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*" +files = [ + {file = "decorator-4.4.2-py2.py3-none-any.whl", hash = "sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760"}, + {file = "decorator-4.4.2.tar.gz", hash = "sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7"}, +] + [[package]] name = "distro" version = "1.9.0" @@ -518,6 +529,56 @@ files = [ {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, ] +[[package]] +name = "imageio" +version = "2.34.0" +description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." +optional = false +python-versions = ">=3.8" +files = [ + {file = "imageio-2.34.0-py3-none-any.whl", hash = "sha256:08082bf47ccb54843d9c73fe9fc8f3a88c72452ab676b58aca74f36167e8ccba"}, + {file = "imageio-2.34.0.tar.gz", hash = "sha256:ae9732e10acf807a22c389aef193f42215718e16bd06eed0c5bb57e1034a4d53"}, +] + +[package.dependencies] +numpy = "*" +pillow = ">=8.3.2" + +[package.extras] +all-plugins = ["astropy", "av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"] +all-plugins-pypy = ["av", "imageio-ffmpeg", "pillow-heif", "psutil", "tifffile"] +build = ["wheel"] +dev = ["black", "flake8", "fsspec[github]", "pytest", "pytest-cov"] +docs = ["numpydoc", "pydata-sphinx-theme", "sphinx (<6)"] +ffmpeg = ["imageio-ffmpeg", "psutil"] +fits = ["astropy"] +full = ["astropy", "av", "black", "flake8", "fsspec[github]", "gdal", "imageio-ffmpeg", "itk", "numpydoc", "pillow-heif", "psutil", "pydata-sphinx-theme", "pytest", "pytest-cov", "sphinx (<6)", "tifffile", "wheel"] +gdal = ["gdal"] +itk = ["itk"] +linting = ["black", "flake8"] +pillow-heif = ["pillow-heif"] +pyav = ["av"] +test = ["fsspec[github]", "pytest", "pytest-cov"] +tifffile = ["tifffile"] + +[[package]] +name = "imageio-ffmpeg" +version = "0.4.9" +description = "FFMPEG wrapper for Python" +optional = false +python-versions = ">=3.5" +files = [ + {file = "imageio-ffmpeg-0.4.9.tar.gz", hash = "sha256:39bcd1660118ef360fa4047456501071364661aa9d9021d3d26c58f1ee2081f5"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:24095e882a126a0d217197b86265f821b4bb3cf9004104f67c1384a2b4b49168"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-manylinux2010_x86_64.whl", hash = "sha256:2996c64af3e5489227096580269317719ea1a8121d207f2e28d6c24ebc4a253e"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-manylinux2014_aarch64.whl", hash = "sha256:7eead662d2f46d748c0ab446b68f423eb63d2b54d0a8ef96f80607245540866d"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-win32.whl", hash = "sha256:b6de1e18911687c538d5585d8287ab1a23624ca9dc2044fcc4607de667bcf11e"}, + {file = "imageio_ffmpeg-0.4.9-py3-none-win_amd64.whl", hash = "sha256:7e900c695c6541b1cb17feb1baacd4009b30a53a45b81c23d53a67ab13ffb766"}, +] + +[package.dependencies] +setuptools = "*" + [[package]] name = "iniconfig" version = "2.0.0" @@ -803,6 +864,30 @@ files = [ griffe = ">=0.37" mkdocstrings = ">=0.20" +[[package]] +name = "moviepy" +version = "1.0.3" +description = "Video editing with Python" +optional = false +python-versions = "*" +files = [ + {file = "moviepy-1.0.3.tar.gz", hash = "sha256:2884e35d1788077db3ff89e763c5ba7bfddbd7ae9108c9bc809e7ba58fa433f5"}, +] + +[package.dependencies] +decorator = ">=4.0.2,<5.0" +imageio = {version = ">=2.5,<3.0", markers = "python_version >= \"3.4\""} +imageio_ffmpeg = {version = ">=0.2.0", markers = "python_version >= \"3.4\""} +numpy = {version = ">=1.17.3", markers = "python_version > \"2.7\""} +proglog = "<=1.0.0" +requests = ">=2.8.1,<3.0" +tqdm = ">=4.11.2,<5.0" + +[package.extras] +doc = ["Sphinx (>=1.5.2,<2.0)", "numpydoc (>=0.6.0,<1.0)", "pygame (>=1.9.3,<2.0)", "sphinx_rtd_theme (>=0.1.10b0,<1.0)"] +optional = ["matplotlib (>=2.0.0,<3.0)", "opencv-python (>=3.0,<4.0)", "scikit-image (>=0.13.0,<1.0)", "scikit-learn", "scipy (>=0.19.0,<1.5)", "youtube_dl"] +test = ["coverage (<5.0)", "coveralls (>=1.1,<2.0)", "pytest (>=3.0.0,<4.0)", "pytest-cov (>=2.5.1,<3.0)", "requests (>=2.8.1,<3.0)"] + [[package]] name = "mpmath" version = "1.3.0" @@ -1106,6 +1191,29 @@ typing-extensions = ">=4.7,<5" [package.extras] datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] +[[package]] +name = "opencv-python-headless" +version = "4.9.0.80" +description = "Wrapper package for OpenCV python bindings." +optional = false +python-versions = ">=3.6" +files = [ + {file = "opencv-python-headless-4.9.0.80.tar.gz", hash = "sha256:71a4cd8cf7c37122901d8e81295db7fb188730e33a0e40039a4e59c1030b0958"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-macosx_10_16_x86_64.whl", hash = "sha256:2ea8a2edc4db87841991b2fbab55fc07b97ecb602e0f47d5d485bd75cee17c1a"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:e0ee54e27be493e8f7850847edae3128e18b540dac1d7b2e4001b8944e11e1c6"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57ce2865e8fec431c6f97a81e9faaf23fa5be61011d0a75ccf47a3c0d65fa73d"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:976656362d68d9f40a5c66f83901430538002465f7db59142784f3893918f3df"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-win32.whl", hash = "sha256:11e3849d83e6651d4e7699aadda9ec7ed7c38957cbbcb99db074f2a2d2de9670"}, + {file = "opencv_python_headless-4.9.0.80-cp37-abi3-win_amd64.whl", hash = "sha256:a8056c2cb37cd65dfcdf4153ca16f7362afcf3a50d600d6bb69c660fc61ee29c"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, +] + [[package]] name = "packaging" version = "24.0" @@ -1325,6 +1433,20 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "proglog" +version = "0.1.10" +description = "Log and progress bar manager for console, notebooks, web..." +optional = false +python-versions = "*" +files = [ + {file = "proglog-0.1.10-py3-none-any.whl", hash = "sha256:19d5da037e8c813da480b741e3fa71fb1ac0a5b02bf21c41577c7f327485ec50"}, + {file = "proglog-0.1.10.tar.gz", hash = "sha256:658c28c9c82e4caeb2f25f488fff9ceace22f8d69b15d0c1c86d64275e4ddab4"}, +] + +[package.dependencies] +tqdm = "*" + [[package]] name = "pycodestyle" version = "2.9.1" @@ -2488,4 +2610,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "dbb1f3241c006408ab8056349c63d7f947450c01fd518a758af66e2e5c000916" +content-hash = "c22b1c0eb7fbae1f326837eacfe7af3dd0ee754d7a074c9ae1b465e05d65e98e" diff --git a/pyproject.toml b/pyproject.toml index 8dc00b82..e18a0d7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,10 @@ sentence-transformers = "2.*" openai = "1.*" typing_extensions = "4.*" +[tool.poetry.group.video.dependencies] +moviepy = "1.*" +opencv-python-headless = "4.*" + [tool.poetry.group.dev.dependencies] autoflake = "1.*" pytest = "7.*" @@ -84,4 +88,5 @@ module = [ "faiss.*", "openai.*", "sentence_transformers.*", + "moviepy.*", ] diff --git a/tests/data/video/test.mp4 b/tests/data/video/test.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..596eea389407ac08915148bbd2212567a1382b4a GIT binary patch literal 17317 zcmeIaV|3-));G9gvtqkq+jfN&RczZfDy+C-I~CiuZB)g!(L3kd``+{1en)qY{?s3O zjr`Z-Z_YL6+sRO$YgED$^-xapiLZ%jR63D zJZmF;C!kLSB*goB>6+MK``)r-V-oEW(K6}Txf?S(D-j)$p{;{45fhMgU}5B9BQnr8 zHfCjH1R5mifes8`6vQOxSc!yGMSz}0#)d$Hh^?Kwm9dEv5i=tr3mr2fGc!PqioZe(n0rEf!T>tM?8k%ivO$=V9&V{7MRZfoPnO=PHVpl`^>MC4#>!pA~n zWNct%YiP;G#LdXfNThG0Z{_Z2%*W`)!p-Q$#Kb~mZOmt8>_+71Yyjjqi0mBQfl%PC z>tMvkM9&BW0XHIRb2non-H(V&K!mP?zKy9d9}_!~p_zlNwZ1M8%0%SkU~FY&?g->K z-8hX5oq&v?y)_>r@Cf=w9=0~de9TO=OiV;3`i@Szc8-?jb{`&p2e7x(wKXwuGn zVwgBdIKwG2Yq*4LtAS*eJ7x72yBs)gTA>9Fa*%iLI15Ocy$|vpe|C8W9f16NMBINBNbSLF@c`0qSj9Fp_dSX7Og(pm zAl;Fd(rIs}rBki(Y?(IwL6VgpoYDcqKc%a>#4B~NY1DI$*^MwixZw$^dlE~k###zP zm8OGwwFxQ3@cq-C=#MQU-CI0`)#@OQVe7ItII_e*7}P*DF#V(9^7pN#n+=^qYPdB1 z0sQlRA*2H&IOrEs?Z=vdDB-OP(~68WeRnL05wr7CBLQ;?>JtvTB<~cxKJd8Y@6o6A z#ha&5EEhthmfmU&%syXP5+&P_VyashV6>VI7>Kewra-;>$5lfpefOU15Cf`u{#YfX z_79R}xuV@kodiZAJB9GY1-{;axx-bIHubGj&N~FQR!^rR-&riy5o3%qmFYA&uk7|6Lkqj7F|iiHX2Au80n@)~5K| zW`$1oK9>0&vN`CX@pg^EdgSJQk8p+_VoM`AeFKD4+Ufz8ztV*8s0{;HaCT z)_()aEfc3WD&I4;A2WuBq!r(-svUHJO~BCu6(Q0O>x%T?e8J;BC_wc}+@kH}r_9a> zpYG9T)j3MrmJU$F)-$k{wXJ)~h5_a-S{I3R7Wum73GuV!tSFLXJWh=WwntIPc%*N8 zO%zo`W@31@R|8PldIf(x3)Ie>#wW8gDudZcU%$$)F+9HpXS^yxbSPp_IpxdfF$mm{ z=D!kk-b&P!Q|(5eRTzpLmYRcW22LoLym8pYKesALfg>LSDYrsU*2AlEI`Qs2}HD z6aC7cO5Sm!PVi(>u?p>bf^*mr$mm((>UP*e0dn2pUSlGc_|Ahk9NO-1#mgg~LS0$B z7Zs7kDWslkx)psDTUw!+R28PGr{$bN(LD>S_WbAtoRNegDdB6-K9jvo3EH5L2quqn z#ZiKnF>Q5mntUff&{6YbG-u{E#5(#6dgX;zr6JOX!Jm>rYiQ*D&ddp|#2H>#IS(DF z{SpuLsm^>Mx0Q~oVZjVCp^RG8b``Bs_NI?_s7t$9cED#rH1@}w(OaF~z3BJ)qQchJ zR8Or=S!sr7;^HF38NnG2WWtx{{l=juuCZ>g!wx)^aAW&0p6a>c-H1JEK}Z;C9v`MX zRt4~gl-aF7IE`2~Zwlj73RjO2yI7%0U1cj-YcT!Plo+nPX`8DCz{Qm3A(2#qbR6O*c??DwNMH4Hopd25`+9g4aOiLpT68RpCTgQ&2`_AVmC ztk9+Mk1=k0otVZuoW(`qM&pKnVtCk0b+xk6JWq-^f8afQH|Sa+9qzIUUIb@BXeK7=o_ zxcQa;+N2+9#B&)Pj;gD2B+bn;oox;mXy8{4;aT(vt~8>f=%$!fRkp(1tecGa+Qre4 z*23$ztw4{J+-!Z^QUW9j-!p2SejeI&;SrlNx@kH!M?lWdasE)$SSvuwBhN^kIfMHBW*g zcNXKMaM4M**Fd88CRQP|X*MHl_qrAo`F$S*c&@U9NZpCR<1|h}$MX>+kzX}XhoHLp zc6b7AZrKz%T-U4l#pMN|r#-66cK-a4m+hMA*tdt7Q)L*=Q=>m6_becrwWcAIw1SpvM$_cUh!; zSfBtL*eyxjMX+vs-Ov8$!n3Lyc3FwwwzZ^vEuA96t@n5gk=%nO!K8>%>8Of~Km&S)F z*){IH1j>4l@-$z+TAte%y2%)@nRapBFvF@Esc(Z3qHO#jCg})Mf5+on=o3Ml>O_{h{`~c-9LQ=fyUVcQZ9pu>5q`y08PH_7T3H=u+UrR zd?vkPCI|Ni4ar2erKALN?92gfLXd+4 zuZebH)wrdLatArJ2-ByZ5T{c~Atd-K-4_bAbaC1C=9=_M8B*J;&FUoG_&R6&7I8t_9gev+b8d^#6^`g3F*87LTC}w zk+!C(b%F^yQK`8%hXiqgT=H2SsYCH*pN^*-Z_9*SRdtOFK8P^2+}Ac`h~H0`%|4T#<7*t)_Z5}lSaP;b<9GG~aZc8+9j6VqOrtl@ z-=ci zOFF@PO>X-R{T?QgflL)#OV*sWvp@rfcR9Ab=7nI`H23-;TYLX6pj1pcNa1qM~yX%UWs&^^JbICR#Wd{9=Q~B=<7UP);L% zHy&G%ahNrbZwK#ebER0GTH3&Nd@qm-vQG|{Ul2b?uosY|?o57byV&)5#Gyet;0k^i z8~(gBhA#JsQn0%5H)KC=<>bqv-K#YH90$MiI0!1O1!GI}x4Wn`5-SV{E_|IYH2e|uiwB{;EsRI! z;-8wEmM`24Pi10I+#r@c zeH(TJ+uUe5$jvtm`x&|79VbOFVr8x{11FKU=TNAZh(C4ka6G)S40pt(bNl^)J+&?l%!<_RCDiVJMm@T;9WSY4c}BsB z(=#tdRw$!YgjK&-OyZpLrzQ>70zQB1sghxov^Ar98-%DF&RY(NSJGgv2&EyidZGF} z9-RfD)1X0+BM>V#Qo`lYZ9ljtBX(qRxFSe#plw_OXm!CEZQ{P}NS#|seU()roQ9SS z$*dpH5rDOTh9GNsTPjRlCufvYjwBC>FP9p zo{?WYZZTA%UJSvA19OCTti|K?7PTgXGG9BTN1sf`Il;xFO5d^5(UZ{~U!K&a5=aE= zg6Vn_w_n7M9rC^~Ksh4W5$h$!f>UCHI(KRGyO(44d;|@^l>}B?r4EzGb(@FQ0FJ}N zyYy{-*yi`pn#!5^fNTr9sv(^X3QsC&e^l?+Wb7@7K?V!m@9n1+S-)|w;>mrM6Pv)* z`{p=1wYP2k#@hY;dk5pudt8^$pB6|Vz)~Mlk%iGg87-^4G;5q<10b8=SMub)7+DC> zb+!6$Qf2JJK@Z7`gIdx>i8>=TWe39=Ur;YimWlAM``<5n;MBcCL^zM4zO|2aaltG8 z$@97!cv{=gbm~fype~D3Ivq)h6Fe>{uJz+`?`@p7cR_Z_KrQ@Q?JTY_N3)?MCiD4X zSlcJSKku{^$y$9ab*YFR*R5s@0mJHR)2H2~JJ8vVm6dYq*18`aI=*|eE6UK07I#LS zEjY__J|YN@23a0}Gc@J2_gQAvL8;`+iSa)xk*(nqXucgJ>OR^2#h&GE_%^WRmV73) zq0p)RZ}mrt8Pu=_u2BnU>93+AKk>lS!lsgiTXRMnU;bD#tCF06_;2rRURp52>5I1gT$^=l)kGgn%(T>zjr4Q3lVI}Tau8pgH4Sr{W z--zo3{bW^K1r%*oYDFvTqXnuRjWc^4H9O(cWg)t|@#+C$#WUQa(b$*OZ zuHrdJPd~spTZ3s}KiG2sjE8w>&-zq(Z^Wf4i}U-JOWO6MBj3e)*>L)xfT&Rp1&<<#JCh-EUObD(RamS%FD(|whq_6bbx?r%PaVN4W6 zvdZzt+i#HNeUWD?Y{Ono{OWw08jkRXYK=2m@eBqU-(~m;`uD8srLgFF%Xd|SpP944o_(n}BE_?UJu4PPQ8pqQ{+;-G0qsQ|QB&i6&ctc=90TUOW!c zdzOa&#H)2*qesm;aepx?8uDRWm*@FH1M(A-|6oC|buaF&z_nvV?A7yJl}-nuz1p_h z{ZB)FVu3RYKb&JlrtdCZjz=gG-n@0KbGia;>CaLEwa?iMO0rbK`?MEYReZ`C3uXhK ze4CpdFycgS<>`l!i>afU%4x238gZNEu_I3zDOrJnXkd87D5vRS8-KR=>8Mb06f$jRE}tM*py01QCR(kGwz+Mj5N7ivL4({^h8T%pA(g#3 z9l0-}B>Nj+@JIHGIB-(#SgxD)vw~CToQ?5W7ELyZP-MA#n(z zo1S4;(Bb9;7OgUBU>+Gy!-CyA-fBkf6z3^OBsveFZK=w5~SJIxp4HCa-MRLWii*pa~3;1vy2q?sU$3VS|a3 zv0ABf;L_*PPK=^ul!xNFp^zr$Q#%()y^=&e=gk-C_78-+i#P-puv?ap?beEb8H=Nv z2d6T)=R}EL7^~?SaqPvqI~$4lFz@IiyyFSQv;l|(tMEO$C-^rwU4|s|W`Wk6EIcT> zu)?dWhmF_r{$4jn4z0U}@PeHgdQ_~jzgjuJbb}XPG*^N1{~=WZ7%oF+H(pcx?(p=T z2tr#p#s}eDL88HR#WuOjJacn0Q19kTdcD3dyiIV=WcZwzs(6S}pLc+8k5z16=Y+Bc z{*HR(!8#!|sjEiyuB(lYZVTN|n;SJxq9WeKDvt@lz_759XCB%5&Vd)~wX^m{^W=GV zMC2NvkO~TE<`SjEHY#jH^p4s817SIo^6?MeH9Di##k%J0Imymx3jnTD)lTIJ6eD%MMVy|e6hlcFeD>Umf0GNRO-HUBji)AB)?^T7_BJq%AiU_uw27Pq#$J+ z%@HBYFIl74T zAErhxzB<($!l*6WB=N=zZ5>TcE|bO~ZVPSPJ%{9L=-}cM7Qnu=;}$BEKNt!n%U7IL z?ufcy)VHNbD?(CcZ5bolob!Z*v(FM9_bvNStNo=Quw@lLhtxG%(H8U47vk<^`f3Qs zo-`7WSHuyJTV2}AOB{rB*;45m#g9M3mAM|$=`ch)>lR8%-v;d0R{WYq6Ij3A1(Zk2 zzFUM;R=u>Z<#^AY{UrN>W;!5Ymy&(z-ox?oXQ<@R;re}paPzP~vQ?ahRI1+tdJuz+ zUmx<)Zo)m)E&5Zxj*I-J$6R>_*4~)kY!R!j_%ti)@kp=eVQu!{j4cq zvWWF+V1i5B@A}GmZw;SKYd!Hsyynw zo+>A zE@4*&PSqPRTTEee3|>9_fvahRD_N}A@Ku#R?APgI5qHP74Q(V8-SK{fYm1J1RC_|- zVS~HG5MmRwte*ty6VFxOrK%BbbGb>!zW0xE9z)*9OUC3*ntBj0D$vVC5N?Mj6t?&! zX3hbA>TEE3n2rK7ymL(fULL^8JU(<-udt(!(`;GlS7$&-aj=|m#UQ>)WVQCP0Wx|XZRPJrgzG97j$bXezN!Sp2%q;>Pv=@sQny?J9{M(r(p zVwmtVe$ioGvEn#KSmaX#(wt$XSv{zGLcQ42)r2iR!m44|fr$$&tzbRIZ~vNP)Cy1z zn7G0q5Vt+y@A!fIEH27u^ zX67&r5uQ)*8lIBh?9$q^WWOi(J#OvhK?>AwIn~!iuJP0kM>Cs*-(lBNC1ykF6zkQM zoFygBR~u6TuA%a@Mla{>V9aSGaq*YZ$NdUdaOARuIlJnKiV->8z6_ONMmVL+4YmoEr9a4?%pMkj z{6c}3D6018n4F-)+r`;?b%pYd0bz)p@p#sVGJr{;aFjKh7Qg5L1$~Fxs>$m570+4z z#p3y4UAUB?(X`k#PC}*F2+X!?hb3)y?+-b?U_Nrw7bI9nEt_(GiBrTF73q#3p>sUK zF)lYDv<>i#LRI6)$tCoG*Ohr4Ikk(r*f$C-Cco98R`46J?%vm(Z-l{1HBn|V*~;QD zlM(h_S^m#B!r^C4Mdvw_JWVXHf*cCA$!#j$x&B&#ND#C}UZULOHM6~;0_Iz=#5kc< zm|qxK=hSK+wc&Ky>4?b$Aw_XW`ax5@^R+#UkD{Q6L(l5nTP8WDLKpoY+Bm;pfZ>cq z6UY$FVh;lh6rmJbk?%5#8#XXRPtOgAvwZj(t=qft@2yl~G3|wsbgsAuYKNlkOTd)P z&7iA$$7<2`UE1b_E_OxX*45VYh!{9JqlT^!sC+2;cefQXb^`9#z~M^sF1sK*-W)F1 zaI21*r>pSR--)n|6!%w3g3ZxH_n*bA@UIVJP73V?e5P2yqCj;Q+fyk<}wVR;=V$w)JKSD;j!GkiT3P2Y+O z0WtQ1_5#P=BYh0oY|2*qWi}1ue1?7Kg|^gJtp7em)Leq zaxa_*{s||m`psO1(8G{XNoeT2J5R294y{|uF}qdz9?r)`<(>)*NnO7SybPnaH(2tJ zk@x%LrRPLOGm2NsFFTe@erX>1)9h~}eJRt3WFiQMWdeHxl~s?r%mN2jxIb4dkHFe7 zIDZq-{2Hb*`T1_w*|);fI(h+XCa;}SsY5?)i;OnF-_v)J)53n=2Ii`a(&d1aR#1?V z6W^Dn5l`&0CY9^Nyx_by9~?Q`v@@ak^L@5a*KyJ;Do=j)|0RDUWe?G0{AAIZyZdFQy>2W0z$1 z-;-)coSaK5rD$$1w?DsVu84z0o4_8IFiVD5py&u8){;N4@5^T>zNqkwcu7&+kg;)P z2XRQt2o2gw{$Z4 z-s?|-kJL%;{)OB`cIWTJo;>dEmS2#?_9910*aAAQt0#Go#2{EM6^Vu>4x7{5v2Q~{ zMZOcQI+(20Xa^NXqI{}*XaI7qSEF{*?UdEiD{b`xR85?w4DZmwLlfAQ4t_A1Oq_Gm zYAL55O~Zb-W5VpMh?e_|0cR-@Yjb5n@;VzM8jr4q2rc}ZC1uF)dpIR^420%;QxX-5 za*O@G@Mz-FgC6W_RT7QwPQWAlN4_Nm`V0C=McE-;XJGFQm2kppq z2F1h6(KGx4Mn)_YRI5o{Rv0#wdAi*Ico=ov;qC--IE3pv3#o@ zl|2o?ZJ{@l_vwt)dJH5DVx}dr8Djeqp}%`CM1mVGb>n&ve^6NMovZVHfmNR>NvT@q z2ehzAX`sZo9M#bTJ)gUW@8z})?Tz&JD)6PyJF@n-I0R@X7|;j76E(gO!_Vs4IvdUK z!aCV5aC>TbgG}Gw|IXyG<@3Wuncf9e#PfUj%!ps%(+%oAjXt0bQ2OSx!s;ZRSNJZ041|t1HZIR=`s+p$;fpDKk2Xr}{Mcx^* zFbjila>9>c9qG%?$!QRpus2HL_glq3n=@8}^|MLsq3-f(Jqq8Rx0fd6E=2QG-6+27 zB+2>5dD@ejsBOrXq_@1`C!XAM-#ryrpu>f5lNvz#31RL!VeI~p(JzU1QpVFyW?9=Z|0C?F#nMszMIg10j=OY4wozh5Bdrd(j9o$q@?Z3qCv~pr~XyPtM*4#)BpANq*Da2c_+g9ul~G50uP!!p^CGM@+~b`Xh9r-CMCin_|=ZgB0$ zDeSw}O#Nl6k2Tg*53G3K`;P*L<8|jROuaV^c_F&ydbD=Z`Q|6r**D_??KAbfo{=2H z2YISr2j0O{DA&|mSidwULE}&Pj&z6b+Lk9JfZ|W>=cqTcH|2hT&ubs4i3PY~<9vpf$e)rY3A<-s$S2kb@5b>^)0p=_^|A!#q!Z5AUkDjcVNh3&b! z9cks8%V#+dKztk=kK!ZPU#ytzn&|9yO@*b>pQhU^%ITR%S}qDJ^4n=>a>s8CRP3XXqwzlZ6+r>x*b74tB!o^S?9h~Vbc>0F`}xJeVi^skRN9B)jk&Ei zb`czD)_}8xa;gqBAZcn2dz5L1m${=fkzH)6loGru!=Tx-oT$Z=Uc4| znTHiKg?I!}Sm_=XO*B`746?kQF?`ScrEuq-v*+!`PiJ?$0DEND-9}Of>;#n=S!~c} zow={(y`Lfu7b(Xz2;1We}UE@(xw9uM$63B6o#X;CBkSzuO4 zjMU1#yv>(E8s?yenmP@mz@%!S{%E3&yZzS7b7|1SKM8l?fP0LlGW`-UM1)%2(`8TZ zWt#0I=4BfKP?mjt;tSY*kj;~e9aw!GG)tfM!XVNfT!_SWtIDHZrPFjkGQ{<|+jPrd zr%zBdp3iFreGIU_;H}kmH&~0%;?danQmx{xLvH2R(#jAENET(c z`C0Hoe0Tn}l2JxRCe$t8Sr+9y*UWABWNERapceTuDz(*l@}4WDtmpGv$J2qX*)01(p%jdl@ zy16Z>2%TNae$tbyFSC(S&2@CQ4WxOu&_wQ@hMG8x7|biw27e;z5}7RBZW+UTNaD6+ zb{0*WQ&I0f!PzSquO}+avT(*9h?8yQCeNU&AoUT|3plXm%V%||SM??bKvQ7HuW?El zzfM*xre-7zy5x^zCTsR6f45iR76@sqYh`HcUaVfhT>Kg@a*>TB*IR`#U_XsOnWtxs zamDm)dyNiu=UsyZMp>%zo~6MHF2o+tjB-`=z4^|u)FfB6#M1XktOmmTdYDtzJi&w0 z2i^`H)Kc;M`$o$N^O6YCPD?5z(XfCwLblv=?1L4nO`2!Spp{oUizzuvj%n4q^~`F! z?I-w=8m2mUNng2AkBAX}H_9^ffPLOiIf#+21>d}P==kljPyCkW(KFbQ7y#mIV|CwU z&|}{P_eE#D?G@s>1N9!fuM`?;=Nv)Az{U?wIIBo@+#MENx4+n-kN+%(OpoEY3ZMU6 zQDFJmKg^ld7#J1<0l z)j_dE58wQz)ptm%fRhBic02T}>ZK^Ve2G-r3g;WMgQwlYqb84&SXW(g2Xd&QN@$n@x3LtC z9x{#Cp*fGzBVF}6L~SHFI8UH#eiz0VO3~RbKa)<<78ZmYrQvUi6}LlsK{2l#gJTaP z3$_vNve*)3Eyk)z2tN2~ZqhS>Ut@Y_+C6kpr44^u1v1TYRD8%Sz_ZG`Vq+k4~*=xrGwI=prZKcqrKP1lD`jG`QpX6O;elox-QCG{V z;_@3V8Ry}KLM&fCeBc+gsZs0f8OofG{K;$SsV<$37vbQzlM^Hf^2Kqlb;r1Bks>KF z9jbkPFTD`qZ%QgKgUyDDoYs{+h}mD=XBF0rUsD*{*sD}E7PK-C;)g}-ift9!9|++Y ztV$$*mB~ND9D@|-t2%aQHbmz>gjaRGj;r0|#nVokW3sA!={ti`o!N#9U-}e#{pWJ-4+-Qs=z%Nbw`R?~X`^Vc9{gNnfmZIVwj7UBKDNWU4>$LD=eBfK|S0Yxl>Nb!6kU)7QPx61e?gp2!ipwcE^XTSs?PDuVf;&lV?us84>FHD zumH9Xdz{HbD7WlUhe@X+V_3I#PU7c2SprzgYfx5B4g-91+)UAmpT-`@R4HHfwaE|5oVsw(;V_)t-zOKjcmsJk`$~$AU@ejLrWKMah~&ouujVX z+@AG$z3Qb5efJSi(>qJOzVW%QEU-gCWF6llzwC;f*v=kW_!l_|%7bKkgK2gk=|w!4 z6p15BK4qlFD5#Q$e4&*bt~W_M?*GN;@a>M|o{yWd%b!m@@9sP}@oBv8^FCHxIEYOv zrkfLT;xkkbFa4}zqo)0NBM&6jAL-~z);Tvj3#HMqntI@%8qtPqMmmgA0i1){ z@bi5p6`8R5##yC8LE=p+Hfp}tWETCD=*^cm-P3n~){Emzh_6GR_nEGp3Z^fvn)m6_ zr-)moN$;^;3W=uymHUv0Go z))^DL{hz#{mH-p_Q~(zTF))HbuRN*B3hQQfKgLyI4B6pGc_Nq|hue#ckw=HV(YliZ*anigUY_3(BAnHT>6n zAhk%b`Pq%0%9wWw54F+#9Au88Gh+ULB?Kp$>aq69TAQ&S-At+>Zwkd}>lUN__y~wM zy89J%*jXW#QP#4fW?$~0d_sBjZC2BX3-7MY{e6+eOr}*kR_-erH_=g-hq-DCTlDax zDLl6m6HbL2`SRJ_Caw>@<$z<-Ay2{hbc|F!=t)_?h{p`QYt zssR-J4rWGkDTx=$H%D_6dLxt7X@MatNnR$1#=>^7LqJ~b_i(%t(@SoT4DSrIVLS`< zAB6E~%={uR!OC+j?8G7R&X@VruNP(6w=<+sOl-CFLiD7ETgyDF?Dt^vs&k1n0deb( z zGY8lYCRdTIDc4OA&?yf=$U!4wp)7+@0-|7jDs3<>1bG6?H*U}k_3H;Y`z^>^+NK}n zoJk7WBb0==zPa1BihLcNAAB7wvJR4Kgoma<9jjb-&pTU7rXy>LgP~pK4Jp%#fM{5W@{!0VXS^WK?2M zF`l}aTh$EfFi{bcDNn{pmV-;qti8$9EIxDjTHTyA79Ge)FyJ`k`xb8NL;A<=G1zg# z7J9LLXX=JjI=fxa$&}zG!9)|Ge(k014<46y<>y9N<1+!l<9%?|oWHIhI62+Ptcp%R z&U)++CtZHjQG5|Fd=vkoR7!rl^}9LlE#T)wg$2wq`3t@nqQsYq1psKE!Upqc^Urk^ z=Nb8({jIX^=ml+vnEL7wKkeV+z-h~Fx{+*Xu zZpzh_JRMZ6KS}qxvDfmk`v^Rp&PP;!MRpMVhQ2|2f}Q3{*CVc+b-QlWBd{p>9DHQ+ zEIaMbpo5<~k7Wf}Gxi4yDcwv20I!p*fz2a$%e(r61GLiubxCt=dO2m*XZqJ_$Y>|~ ziw8?s*O9>(WRU(Gr(8_|u_wVePWVgQk0#krmIeG01Id{rivoRnp-bA0d9{#8c`NMD z3rqUB$H(vyveM1E9Bk=<7s2`Q2mPoZ{ZS6fDX0a@l#3?vThY9$)q+`6N{UDO7Jr-H zs?_USq6H0(?NHT=Xuh8Q$~JJ5eELhHAv5<*5-O2+Nq!T}5d~i(leMyBB`1-Lgttyn zZ4_n$Dp&YG=X-{C73Wzl;oF&1sWDB(%JVtpcq?H&KQG6t5>~NKm?JhGB4?$u{C+{dJX+lU8!Wy8!(eMtP;olg{}Oe)3QE=)<9jJh1{& z&C96O#pO&8a{~J>rthMP8CzfZkqK~IyW!6AzOIN7wniGJ=i)PUM~tK@-4Tk)NX?Vw z5m}9=Fw}cq9(NKF|5%l~#OJ#7S;zk@`mw>IEopRvFWV>$K5EH6h2 z1P#Alr#;#m9r*W{7$9^sS_V_e@Kkr3+hwnLld8%Nah^ospb-X{uEJgmQoD#({DsQP zCaQ{#bF7|022qei!qFnN>3pA=d>L4e^(*LTsUpfFZ^}bTm-87 zW}3^L`TcX!8|uTgEANBRwsDq_wXQgemv4BVxRM698I}TZz1)?<30^x!%SHLGfu8 z?5sT#+~jdUUZ1F{H!4AbLr)~N`g}dJY@fWE-3$i@$ABUjvej5&Ivp>R$f@!!J|kG? z$ImJ3m*`hEeIp7Ad&mTqx;@mUMUl7TT# z2J{>08?g_Bd~r+R2x5Yo6LJI|%Gcg!@#UESPLdK3+J<{E1Go{8LN|POy88o`?T>r8 zLaAg{z5r0fzxYgHw;%yb#Sb1NQ{PEA3(xLv-p~Tb+XHz-p;R-eVj;5>S)_LWC=yTu zE0h@`hC)mH{%>5t2kyfc7^CO|_Ynh#D+l7N{>FV|1L8)+(Eouu{71gyf8Z|ugZn#Q z7UDl~S#-Z`}X0`JMlLPH5$KklDX* zSqQ*<J(#>jKnSS-Y4S0qH*$*Z}|lH3a|xe0=?9{SN`C{x4bZ zKc4?P4io?Yj{sgLwgeiroGky!3HA@ie?5Hga+VQaocT zN8sJ}zZNjUM_llKof#a=+Q?iVXd$vT`tRBqfoeT~O5a~NIx{0Hhrc-B_HnEI-~0zv z{kY{Wsc&OsW&Gg-Rsp={YyxD^T&({J`p;vN8U1a;GXh?w{s05lhaKM8!HVc_9t?Qt z+rSFQS2#L3{(B%FKA9gE`+@Xh{WIp@8b4s!l0f$3`B?vFkH7W**|FZ=fdVTqstAY? z0PcMofN2V(<;%k0W`e*v0Kg3%tRL3c4+H>dcKm0Qr0ezJ;RBBbyyyLQOF>_=0f2{( zw3#4?XY!2{ZhUFT>wD;7ITmBF~Fn`4Y z(*NK7cR&8$_6Pc-pTK7R)lWR&Yz#jX!1b@$e9Y}Hehz5!_{;M52GsL}0a^I}dQgG# z$2XRL>;Jzj17C}Vw!pJ~tdDv>YBg{+a?%IVZ`Q_tQ6LK-{^#2r_z~ftZ)f-Mj{IMX n&iwDs6!H=$+mC?wz<;mt5#%E%Kvv(ygpG@ynS-8*k@5ckH212{ literal 0 HcmV?d00001 diff --git a/tests/tools/test_video.py b/tests/tools/test_video.py new file mode 100644 index 00000000..8952c238 --- /dev/null +++ b/tests/tools/test_video.py @@ -0,0 +1,7 @@ +from vision_agent.tools.video import extract_frames_from_video + + +def test_extract_frames_from_video(): + video_path = "tests/data/video/test.mp4" + res = extract_frames_from_video(video_path) + assert len(res) == 1 diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index c5f50e98..23116444 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -505,6 +505,33 @@ def __call__(self, input: List[int]) -> float: return round(input[0] / input[1], 2) +class ExtractFrames(Tool): + name = "extract_frames_" + description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video." + usage = { + "required_parameters": [{"name": "video_uri", "type": "str"}], + "examples": [ + { + "scenario": "Can you extract the frames from this video? Video: www.foobar.com/video?name=test.mp4", + "parameters": {"video_uri": "www.foobar.com/video?name=test.mp4"}, + }, + { + "scenario": "Can you extract the images from this video file? Video path: tests/data/test.mp4", + "parameters": {"video_uri": "tests/data/test.mp4"}, + }, + ], + } + + def __call__(self, video_uri: str) -> list[tuple[np.ndarray, float]]: + try: + from vision_agent.tools.video import extract_frames_from_video + except Exception as e: + raise ImportError( + "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead." + ) from e + return extract_frames_from_video(video_uri) + + TOOLS = { i: {"name": c.name, "description": c.description, "usage": c.usage, "class": c} for i, c in enumerate( @@ -520,6 +547,7 @@ def __call__(self, input: List[int]) -> float: Subtract, Multiply, Divide, + ExtractFrames, ] ) if (hasattr(c, "name") and hasattr(c, "description") and hasattr(c, "usage")) diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py new file mode 100644 index 00000000..1957915d --- /dev/null +++ b/vision_agent/tools/video.py @@ -0,0 +1,190 @@ +import logging +import math +import os +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import cast + +import cv2 +import numpy as np +from moviepy.video.io.VideoFileClip import VideoFileClip +from tqdm import tqdm + +_LOGGER = logging.getLogger(__name__) +# The maximum length of the clip to extract frames from, in seconds +_CLIP_LENGTH = 30.0 + + +def extract_frames_from_video( + video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06 +) -> list[tuple[np.ndarray, float]]: + """Extract frames from a video + + Parameters + ---------- + video_uri: str, the path to the video file or a video file url + fps: int, the frame rate per second to extract the frames + motion_detection_threshold: float, the threshold to detect the motion between frames. + A value between 0-1, the percentage change that is considered a different frame. + A lower value means more frames will be extracted. + + Returns + ------- + list[tuple[np.ndarray, int]], a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...] + The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. + The frames are sorted by the timestamp in ascending order. + """ + with VideoFileClip(video_uri) as video: + video_duration: float = video.duration + num_workers = os.cpu_count() + clip_length: float = min(video_duration, _CLIP_LENGTH) + start_times = list(range(0, math.ceil(video_duration), math.ceil(clip_length))) + assert start_times, f"No frames to extract from the input video: {video_uri}" + segment_args = [ + { + "video_uri": video_uri, + "start": start, + "end": ( + start + clip_length if i < len(start_times) - 1 else video_duration + ), + "fps": fps, + "motion_detection_threshold": motion_detection_threshold, + } + for i, start in enumerate(start_times) + ] + if ( + cast(float, segment_args[-1]["end"]) + - cast(float, segment_args[-1]["start"]) + < 1 + ): + # If the last segment is less than 1s, merge it with the previous segment + # This is to avoid the failure of the last segment extraction + assert ( + len(segment_args) > 1 + ), "Development bug - Expect at least 2 segments." + segment_args[-2]["end"] = video_duration + segment_args.pop(-1) + _LOGGER.info( + f"""Created {len(segment_args)} segments from the input video {video_uri} of length {video.duration}s, with clip size: {clip_length}s and {num_workers} workers. + Segments: {segment_args} + """ + ) + frames = [] + with tqdm(total=len(segment_args)) as pbar: + with ProcessPoolExecutor(max_workers=num_workers) as executor: + futures = [ + executor.submit(_extract_frames_by_clip, **kwargs) # type: ignore + for kwargs in segment_args + ] + for future in as_completed(futures): + result = future.result() + frames.extend(result) + pbar.update(1) + frames.sort(key=lambda x: x[1]) + _LOGGER.info(f"Extracted {len(frames)} frames from video {video_uri}") + return frames + + +def _extract_frames_by_clip( + video_uri: str, + start: int = 0, + end: float = -1, + fps: int = 2, + motion_detection_threshold: float = 0.06, +) -> list[tuple[np.ndarray, float]]: + """Extract frames from a video clip with start and end time in seconds. + + Parameters + ---------- + video_uri: str, the path to the video file or a video file url + start: int, the start time (in seconds) of the clip to extract + end: float, the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video + fps: int, the frame rate to extract the frames + motion_detection_threshold: float, the threshold to detect the motion between frames + """ + with VideoFileClip(video_uri) as video: + source_fps = video.fps + if end <= 0: + end = video.duration + _LOGGER.info( + f"Extracting frames from video {video_uri} ({video.duration}s) with start={start}s and end={end}s" + ) + clip = video.subclip(start, end) + processable_frames = int(clip.duration * fps) + _LOGGER.info( + f"Extracting frames from video clip of length {clip.duration}s with FPS={fps} and start_time={start}s. Total number of frames in clip: {processable_frames}" + ) + frames = [] + total_count, skipped_count = 0, 0 + prev_processed_frame = None + pbar = tqdm( + total=processable_frames, desc=f"Extracting frames from clip {start}-{end}" + ) + for i, frame in enumerate(clip.iter_frames(fps=fps, dtype="uint8")): + curr_processed_frame = _preprocess_frame(frame) + total_count += 1 + pbar.update(1) + # Skip the frame if it is similar to the previous one + if prev_processed_frame is not None and _similar_frame( + prev_processed_frame, + curr_processed_frame, + threshold=motion_detection_threshold, + ): + skipped_count += 1 + continue + prev_processed_frame = curr_processed_frame + ts = round(clip.reader.pos / source_fps, 3) + frames.append((frame, ts)) + + _LOGGER.info( + f"""Finished! + Frames extracted: {len(frames)} + Extracted frame timestamp: {[f[1] for f in frames]} + Total processed frames: {total_count} + Skipped frames: {skipped_count} + Scan FPS: {fps} + Clip start time: {start}s, {clip.pos} + Clip end time: {end}s + Clip duration: {clip.duration}s + Clip total frames: {clip.duration * source_fps} + Video duration: {video.duration}s + Video FPS: {video.fps} + Video total frames: {video.reader.nframes}""" + ) + return frames + + +def _preprocess_frame(frame: np.ndarray) -> np.ndarray: + # Convert to grayscale + frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) + frame = cv2.GaussianBlur(src=frame, ksize=(5, 5), sigmaX=0) + return frame + + +def _similar_frame( + prev_frame: np.ndarray, curr_frame: np.ndarray, threshold: float +) -> bool: + """Detect two frames are similar or not + + Parameters + ---------- + threshold : float, optional + Similarity threshold, a value between 0-1, the percentage change that is considered a different frame. + """ + # calculate difference and update previous frame TODO: don't assume the processed image is cached + diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame) + # Only take different areas that are different enough (>20 / 255) + thresh_frame = cv2.threshold( + src=diff_frame, thresh=20, maxval=255, type=cv2.THRESH_BINARY + )[1] + change_percentage = cv2.countNonZero(thresh_frame) / ( + curr_frame.shape[0] * curr_frame.shape[1] + ) + _LOGGER.debug(f"Image diff: {change_percentage}") + return change_percentage < threshold + + +# res = extract_frames(video) +if __name__ == "__main__": + video_path = "/Users/asia/Downloads/frames/baby_cam1.MP4" + res = extract_frames_from_video(video_path) + print("done, extracted num frames: ", len(res)) From 8daf1f0b223ae5ab2a261d82d653cbc09270ea93 Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Tue, 26 Mar 2024 11:48:47 -0700 Subject: [PATCH 2/7] Update ExtractFrames tool --- poetry.lock | 4 ++-- vision_agent/tools/tools.py | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/poetry.lock b/poetry.lock index d7c66b5d..d6ba3c83 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1209,9 +1209,9 @@ files = [ [package.dependencies] numpy = [ + {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -1275,8 +1275,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""}, {version = ">=1.23.2,<2", markers = "python_version == \"3.11\""}, + {version = ">=1.22.4,<2", markers = "python_version < \"3.11\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 23116444..787f1914 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -507,7 +507,7 @@ def __call__(self, input: List[int]) -> float: class ExtractFrames(Tool): name = "extract_frames_" - description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video." + description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame." usage = { "required_parameters": [{"name": "video_uri", "type": "str"}], "examples": [ @@ -522,14 +522,23 @@ class ExtractFrames(Tool): ], } - def __call__(self, video_uri: str) -> list[tuple[np.ndarray, float]]: + def __call__(self, video_uri: str) -> list[tuple[str, float]]: try: from vision_agent.tools.video import extract_frames_from_video except Exception as e: raise ImportError( "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead." ) from e - return extract_frames_from_video(video_uri) + frames = extract_frames_from_video(video_uri) + result = [] + _LOGGER.info( + f"Extracted {len(frames)} frames from video {video_uri}. Temporarily saving them as images to disk for downstream tasks." + ) + for frame, ts in frames: + with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: + Image.fromarray(frame).save(tmp) + result.append((tmp.name, ts)) + return result TOOLS = { From 57ba26824c6cc416dbbd8df9dfeaef43ca7c6a51 Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Tue, 26 Mar 2024 16:38:32 -0700 Subject: [PATCH 3/7] minor updates --- docs/api/tools.md | 4 +++- poetry.lock | 2 +- pyproject.toml | 2 -- tests/tools/test_video.py | 1 + vision_agent/tools/tools.py | 16 ++++++++++------ vision_agent/tools/video.py | 13 +++---------- 6 files changed, 18 insertions(+), 20 deletions(-) diff --git a/docs/api/tools.md b/docs/api/tools.md index fa3fba93..71935d46 100644 --- a/docs/api/tools.md +++ b/docs/api/tools.md @@ -2,4 +2,6 @@ ::: vision_agent.tools.prompts -::: vision_agent.tools.tools \ No newline at end of file +::: vision_agent.tools.tools + +::: vision_agent.tools.video \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index d6ba3c83..e50ff090 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2610,4 +2610,4 @@ watchmedo = ["PyYAML (>=3.10)"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.12" -content-hash = "c22b1c0eb7fbae1f326837eacfe7af3dd0ee754d7a074c9ae1b465e05d65e98e" +content-hash = "93a4e362ddaf2a1e65a6457c212896853b1c4409e0456f9209f33b795b5ec748" diff --git a/pyproject.toml b/pyproject.toml index e18a0d7c..7b7686d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,6 @@ torch = "2.1.*" # 2.2 causes sentence-transformers to seg fault sentence-transformers = "2.*" openai = "1.*" typing_extensions = "4.*" - -[tool.poetry.group.video.dependencies] moviepy = "1.*" opencv-python-headless = "4.*" diff --git a/tests/tools/test_video.py b/tests/tools/test_video.py index 8952c238..a19b529f 100644 --- a/tests/tools/test_video.py +++ b/tests/tools/test_video.py @@ -2,6 +2,7 @@ def test_extract_frames_from_video(): + # TODO: consider generating a video on the fly instead video_path = "tests/data/video/test.mp4" res = extract_frames_from_video(video_path) assert len(res) == 1 diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 787f1914..99afcee3 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -11,6 +11,7 @@ from PIL.Image import Image as ImageType from vision_agent.image_utils import convert_to_b64, get_image_size +from vision_agent.tools.video import extract_frames_from_video _LOGGER = logging.getLogger(__name__) @@ -523,12 +524,15 @@ class ExtractFrames(Tool): } def __call__(self, video_uri: str) -> list[tuple[str, float]]: - try: - from vision_agent.tools.video import extract_frames_from_video - except Exception as e: - raise ImportError( - "vision_agent is not installed correctly (cause: missing dependencies), please run 'pip install vision-agent[video]' instead." - ) from e + """Extract frames from a video clip with start and end time in seconds. + + + Parameters: + video_uri: the path to the video file or a url points to the video data + + Returns: + a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(path_to_frame1, 0.0), (path_to_frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. + """ frames = extract_frames_from_video(video_uri) result = [] _LOGGER.info( diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py index 1957915d..a86efc49 100644 --- a/vision_agent/tools/video.py +++ b/vision_agent/tools/video.py @@ -23,9 +23,9 @@ def extract_frames_from_video( ---------- video_uri: str, the path to the video file or a video file url fps: int, the frame rate per second to extract the frames - motion_detection_threshold: float, the threshold to detect the motion between frames. - A value between 0-1, the percentage change that is considered a different frame. - A lower value means more frames will be extracted. + motion_detection_threshold: float, The threshold to detect motion between changes/frames. + A value between 0-1, which represents the percentage change required for the frames to be considered in motion. + For example, a lower value means more frames will be extracted. Returns ------- @@ -181,10 +181,3 @@ def _similar_frame( ) _LOGGER.debug(f"Image diff: {change_percentage}") return change_percentage < threshold - - -# res = extract_frames(video) -if __name__ == "__main__": - video_path = "/Users/asia/Downloads/frames/baby_cam1.MP4" - res = extract_frames_from_video(video_path) - print("done, extracted num frames: ", len(res)) From b8be42dfde8d1b0be1d72c665f95e801f8081ebd Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Tue, 26 Mar 2024 16:49:01 -0700 Subject: [PATCH 4/7] Update docs --- README.md | 1 + docs/index.md | 1 + vision_agent/tools/tools.py | 4 +++- vision_agent/tools/video.py | 39 +++++++++++++++---------------------- 4 files changed, 21 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 5687dd43..6879e734 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,7 @@ you. For example: | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. | | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. | | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. | +| ExtractFrames | ExtractFrames extracts image frames from the input video. | It also has a basic set of calculate tools such as add, subtract, multiply and divide. diff --git a/docs/index.md b/docs/index.md index 2a83bef7..e0033dfa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -90,6 +90,7 @@ you. For example: | Crop | Crop crops an image given a bounding box and returns a file name of the cropped image. | | BboxArea | BboxArea returns the area of the bounding box in pixels normalized to 2 decimal places. | | SegArea | SegArea returns the area of the segmentation mask in pixels normalized to 2 decimal places. | +| ExtractFrames | ExtractFrames extracts image frames from the input video. | It also has a basic set of calculate tools such as add, subtract, multiply and divide. diff --git a/vision_agent/tools/tools.py b/vision_agent/tools/tools.py index 99afcee3..6e55f210 100644 --- a/vision_agent/tools/tools.py +++ b/vision_agent/tools/tools.py @@ -507,6 +507,8 @@ def __call__(self, input: List[int]) -> float: class ExtractFrames(Tool): + r"""Extract frames from a video.""" + name = "extract_frames_" description = "'extract_frames_' extract image frames from the input video, return a list of tuple (frame, timestamp), where the timestamp is the relative time in seconds of the frame occurred in the video, the frame is a local image file path that stores the frame." usage = { @@ -524,7 +526,7 @@ class ExtractFrames(Tool): } def __call__(self, video_uri: str) -> list[tuple[str, float]]: - """Extract frames from a video clip with start and end time in seconds. + """Extract frames from a video. Parameters: diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py index a86efc49..5745c2fe 100644 --- a/vision_agent/tools/video.py +++ b/vision_agent/tools/video.py @@ -19,19 +19,15 @@ def extract_frames_from_video( ) -> list[tuple[np.ndarray, float]]: """Extract frames from a video - Parameters - ---------- - video_uri: str, the path to the video file or a video file url - fps: int, the frame rate per second to extract the frames - motion_detection_threshold: float, The threshold to detect motion between changes/frames. - A value between 0-1, which represents the percentage change required for the frames to be considered in motion. - For example, a lower value means more frames will be extracted. + Parameters: + video_uri: the path to the video file or a video file url + fps: the frame rate per second to extract the frames + motion_detection_threshold: The threshold to detect motion between changes/frames. + A value between 0-1, which represents the percentage change required for the frames to be considered in motion. + For example, a lower value means more frames will be extracted. - Returns - ------- - list[tuple[np.ndarray, int]], a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...] - The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. - The frames are sorted by the timestamp in ascending order. + Returns: + a list of tuples containing the extracted frame and the timestamp in seconds. E.g. [(frame1, 0.0), (frame2, 0.5), ...]. The timestamp is the time in seconds from the start of the video. E.g. 12.125 means 12.125 seconds from the start of the video. The frames are sorted by the timestamp in ascending order. """ with VideoFileClip(video_uri) as video: video_duration: float = video.duration @@ -93,13 +89,12 @@ def _extract_frames_by_clip( ) -> list[tuple[np.ndarray, float]]: """Extract frames from a video clip with start and end time in seconds. - Parameters - ---------- - video_uri: str, the path to the video file or a video file url - start: int, the start time (in seconds) of the clip to extract - end: float, the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video - fps: int, the frame rate to extract the frames - motion_detection_threshold: float, the threshold to detect the motion between frames + Parameters: + video_uri: the path to the video file or a video file url + start: the start time (in seconds) of the clip to extract + end: the end time (in seconds, up to millisecond level precision) of the clip to extract, if -1, extract the whole video + fps: the frame rate to extract the frames + motion_detection_threshold: the threshold to detect the motion between frames """ with VideoFileClip(video_uri) as video: source_fps = video.fps @@ -165,10 +160,8 @@ def _similar_frame( ) -> bool: """Detect two frames are similar or not - Parameters - ---------- - threshold : float, optional - Similarity threshold, a value between 0-1, the percentage change that is considered a different frame. + Parameters: + threshold: similarity threshold, a value between 0-1, the percentage change that is considered a different frame. """ # calculate difference and update previous frame TODO: don't assume the processed image is cached diff_frame = cv2.absdiff(src1=prev_frame, src2=curr_frame) From d6bd2233e0542ae5a43c34095ea34ebadf750c37 Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Wed, 27 Mar 2024 09:31:46 -0700 Subject: [PATCH 5/7] Attempt to support py 3.9 --- .github/workflows/ci_cd.yml | 2 +- vision_agent/tools/video.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml index d21d703b..ade39771 100644 --- a/.github/workflows/ci_cd.yml +++ b/.github/workflows/ci_cd.yml @@ -9,7 +9,7 @@ jobs: Test: strategy: matrix: - python-version: [3.10.11] + python-version: [3.9, 3.10.11] os: [ ubuntu-22.04, windows-2022, macos-12 ] runs-on: ${{ matrix.os }} steps: diff --git a/vision_agent/tools/video.py b/vision_agent/tools/video.py index 5745c2fe..606166ed 100644 --- a/vision_agent/tools/video.py +++ b/vision_agent/tools/video.py @@ -2,7 +2,7 @@ import math import os from concurrent.futures import ProcessPoolExecutor, as_completed -from typing import cast +from typing import List, Tuple, cast import cv2 import numpy as np @@ -16,7 +16,7 @@ def extract_frames_from_video( video_uri: str, fps: int = 2, motion_detection_threshold: float = 0.06 -) -> list[tuple[np.ndarray, float]]: +) -> List[Tuple[np.ndarray, float]]: """Extract frames from a video Parameters: @@ -86,7 +86,7 @@ def _extract_frames_by_clip( end: float = -1, fps: int = 2, motion_detection_threshold: float = 0.06, -) -> list[tuple[np.ndarray, float]]: +) -> List[Tuple[np.ndarray, float]]: """Extract frames from a video clip with start and end time in seconds. Parameters: From 773c47a90bce22818e0ee4bf375e848e92b7080c Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Wed, 27 Mar 2024 09:38:36 -0700 Subject: [PATCH 6/7] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7b7686d0..a0fd24d2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ packages = [{include = "vision_agent"}] "documentation" = "https://github.com/landing-ai/vision-agent" [tool.poetry.dependencies] # main dependency group -python = ">=3.10,<3.12" +python = ">=3.9,<3.12" numpy = ">=1.21.0,<2.0.0" pillow = "10.*" From ce3a7d6767c0e5c6d7309a90ccf2dc11fe869ddb Mon Sep 17 00:00:00 2001 From: Yazhou Cao Date: Wed, 27 Mar 2024 09:45:02 -0700 Subject: [PATCH 7/7] Test 3.9 and 3.11 in CI --- .github/workflows/ci_cd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml index ade39771..56c31d19 100644 --- a/.github/workflows/ci_cd.yml +++ b/.github/workflows/ci_cd.yml @@ -9,7 +9,7 @@ jobs: Test: strategy: matrix: - python-version: [3.9, 3.10.11] + python-version: [3.9, 3.11] os: [ ubuntu-22.04, windows-2022, macos-12 ] runs-on: ${{ matrix.os }} steps: