diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3978a0f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.git +.gitignore diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..6deafc2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f3c231a --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +*.py[cod] + +*.pth +*.pb diff --git a/Dockerfile.cpu b/Dockerfile.cpu new file mode 100644 index 0000000..8d54486 --- /dev/null +++ b/Dockerfile.cpu @@ -0,0 +1,67 @@ +FROM ubuntu:18.04 + +WORKDIR /usr/src/app + +ENV LANG="C.UTF-8" LC_ALL="C.UTF-8" PATH="/opt/venv/bin:$PATH" PIP_NO_CACHE_DIR="false" CFLAGS="-mavx2" CXXFLAGS="-mavx2" + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv \ + wget make g++ ffmpeg python3-dev libblas-dev liblapack-dev swig \ + cmake yasm zlib1g-dev && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . + +RUN python3 -m venv /opt/venv && \ + python3 -m pip install pip==19.2.3 pip-tools==4.0.0 + +# For pytorch and torchvision we need platform specific (cpu vs. gpu) wheels from +# https://download.pytorch.org/whl/cpu/torch_stable.html +# To generate hashes run: python3 -m pip hash *.whl +RUN echo "https://download.pytorch.org/whl/cpu/torch-1.2.0%2Bcpu-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:7b9b943673d3acb446248ba0d6feed6926bf60ce719ace4707a6559c1f57ced7 \ + \n \ + https://download.pytorch.org/whl/cpu/torchvision-0.4.0%2Bcpu-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:63f342b858b18839fcf3ff8ad857e44a4ff0fcb8cb8e2bdc2f4ed9afa7cec9e0 \ + \n" >> requirements.txt && cat requirements.txt + +RUN python3 -m piptools sync + +RUN python3 -c "from torchvision.models import resnet50; resnet50(pretrained=True, progress=False)" && \ + python3 -c "from torchvision.models.video import r2plus1d_18; r2plus1d_18(pretrained=True, progress=False)" + +RUN wget -q https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.3.tar.gz -O libjpeg-turbo.tar.gz && \ + echo "a69598bf079463b34d45ca7268462a18b6507fdaa62bb1dfd212f02041499b5d libjpeg-turbo.tar.gz" | sha256sum -c && \ + tar xf libjpeg-turbo.tar.gz && \ + rm libjpeg-turbo.tar.gz && \ + cd libjpeg-turbo* && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DREQUIRE_SIMD=On -DCMAKE_INSTALL_PREFIX=/usr/local .. && \ + make -j $(nproc) && \ + make install && \ + ldconfig && \ + cd ../../ && \ + rm -rf libjpeg-turbo* + +RUN python3 -m pip uninstall -y pillow && \ + python3 -m pip install --no-binary :all: --compile pillow-simd==6.0.0.post0 + +RUN wget -q https://github.com/facebookresearch/faiss/archive/v1.5.3.tar.gz -O faiss.tar.gz && \ + echo "b24d347b0285d01c2ed663ccc7596cd0ea95071f3dd5ebb573ccfc28f15f043b faiss.tar.gz" | sha256sum -c && \ + tar xf faiss.tar.gz && \ + rm faiss.tar.gz && \ + cd faiss* && \ + ./configure --without-cuda && \ + make -j $(nproc) && \ + make -j $(nproc) -C python && \ + make install && \ + make -C python install && \ + cd .. && \ + rm -rf faiss* + +COPY . . 
+ +EXPOSE 5000 +ENTRYPOINT ["/usr/src/app/bin/sfi"] +CMD ["-h"] diff --git a/Dockerfile.gpu b/Dockerfile.gpu new file mode 100644 index 0000000..1dba111 --- /dev/null +++ b/Dockerfile.gpu @@ -0,0 +1,68 @@ +FROM nvidia/cuda:10.1-cudnn7-devel + +WORKDIR /usr/src/app + +ENV LANG="C.UTF-8" LC_ALL="C.UTF-8" PATH="/opt/venv/bin:$PATH" PIP_NO_CACHE_DIR="false" CFLAGS="-mavx2" CXXFLAGS="-mavx2" + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv \ + wget make g++ ffmpeg python3-dev libblas-dev liblapack-dev swig \ + cmake yasm zlib1g-dev && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . + +RUN python3 -m venv /opt/venv && \ + python3 -m pip install pip==19.2.3 pip-tools==4.0.0 + +# For pytorch and torchvision we need platform specific (cpu vs. gpu) wheels from +# https://download.pytorch.org/whl/cu100/torch_stable.html +# To generate hashes run: python3 -m pip hash *.whl +RUN echo "https://download.pytorch.org/whl/cu100/torch-1.2.0-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:a13bf6f78a49d844b85c142b8cd62d2e1833a11ed21ea0bc6b1ac73d24c76415 \ + \n \ + https://download.pytorch.org/whl/cu100/torchvision-0.4.0-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:2f67efdf6edd9ea7f9cd9a3917ae5c63d5684e3bdb5cc9c2b364c15bdfe4456b \ + \n" >> requirements.txt + +RUN python3 -m piptools sync + +RUN python3 -c "from torchvision.models import resnet50; resnet50(pretrained=True, progress=False)" && \ + python3 -c "from torchvision.models.video import r2plus1d_18; r2plus1d_18(pretrained=True, progress=False)" + +RUN wget -q https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.3.tar.gz -O libjpeg-turbo.tar.gz && \ + echo "a69598bf079463b34d45ca7268462a18b6507fdaa62bb1dfd212f02041499b5d libjpeg-turbo.tar.gz" | sha256sum -c && \ + tar xf libjpeg-turbo.tar.gz && \ + rm libjpeg-turbo.tar.gz && \ + cd libjpeg-turbo* && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DREQUIRE_SIMD=On -DCMAKE_INSTALL_PREFIX=/usr/local .. && \ + make -j $(nproc) && \ + make install && \ + ldconfig && \ + cd ../../ && \ + rm -rf libjpeg-turbo* + + +RUN python3 -m pip uninstall -y pillow && \ + python3 -m pip install --no-binary :all: --compile pillow-simd==6.0.0.post0 + +RUN wget -q https://github.com/facebookresearch/faiss/archive/v1.5.3.tar.gz -O faiss.tar.gz && \ + echo "b24d347b0285d01c2ed663ccc7596cd0ea95071f3dd5ebb573ccfc28f15f043b faiss.tar.gz" | sha256sum -c && \ + tar xf faiss.tar.gz && \ + rm faiss.tar.gz && \ + cd faiss* && \ + ./configure --with-cuda-arch="-gencode=arch=compute_37,code=compute_37 -gencode=arch=compute_70,code=compute_70" --with-cuda="/usr/local/cuda" && \ + make -j $(nproc) && \ + make -j $(nproc) -C python && \ + make install && \ + make -C python install && \ + cd .. && \ + rm -rf faiss* + +COPY . . 
+ +EXPOSE 5000 +ENTRYPOINT ["/usr/src/app/bin/sfi"] +CMD ["-h"] diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2620140 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 MoabitCoin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4d72bfc --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +dockerimage ?= moabitcoin/sfi +dockerfile ?= Dockerfile.cpu +srcdir ?= $(shell pwd) +datadir ?= $(shell pwd) + +install: + @docker build -t $(dockerimage) -f $(dockerfile) . + +i: install + + +update: + @docker build -t $(dockerimage) -f $(dockerfile) . --pull --no-cache + +u: update + + +run: + @docker run -it --rm --ipc="host" --network="host" -p 5000:5000 -v $(srcdir)/sfi:/usr/src/app/sfi -v $(datadir):/data --entrypoint=/bin/bash $(dockerimage) + +r: run + + +publish: + @docker image save $(dockerimage) \ + | pv -N "Publish $(dockerimage) to $(sshopts)" -s $(shell docker image inspect $(dockerimage) --format "{{.Size}}") \ + | ssh $(sshopts) "docker image load" + +p: publish + + +.PHONY: install i run r update u publish p diff --git a/README.md b/README.md new file mode 100644 index 0000000..aa4012b --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +# Semantic Frame Index + +Fast and efficient queries on video frames by semantic similarity. + + +## Use Case + +We record tens of thousands of hours of drive video data and need to be able to search for semantically similar scenarios. +Similarity could mean similar lighting conditions, similar vehicle types, similar traffic volumes, similar objects on the road, and so on. + + +## Implementation Sketch + +We +- extract key frames using a neural net for frame similarity in feature space +- extract a trained convolutional neural net's high level feature maps for all key frames +- compute Maximum Activations of Convolution (MAC) features from the high-level feature maps (see the sketch below) +- index the feature maps for approximate nearest neighbor searches based on L2 distance +- query the indexed dataset for semantically similar scenarios + + +## Usage + +All tools can be invoked via + + ./bin/sfi <command> + + ./bin/sfi --help + ./bin/sfi <command> --help + + +### stream-index + +Builds an index from a directory of images for fast and efficient approximate nearest neighbor queries based on L2 distance. +The quantizer for the index needs to be trained on a small subset of the feature maps to approximate the dataset's centroids. +We recommend running this step on GPUs.
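To make the MAC step in the implementation sketch above concrete, here is a minimal sketch of how a MAC descriptor can be computed from a ResNet-50 feature map, mirroring `sfi/features.py` and the pooling in `sfi/tools/stream.py`. The dummy input batch and its dimensions are illustrative assumptions only, not part of the tools' interface.

```python
# Minimal MAC sketch (assumes torch, torchvision and einops as pinned in requirements.txt):
# max-pool a ResNet-50 feature map over its spatial locations so that every image
# ends up as a single 2048-d descriptor that can be added to the faiss index.
import torch
import torch.nn as nn
from torchvision.models import resnet50
from einops import rearrange, reduce

net = resnet50(pretrained=True, progress=False)
net.fc = nn.Identity()       # chop off the classification head
net.avgpool = nn.Identity()  # keep spatial information
net.eval()

with torch.no_grad():
    images = torch.rand(2, 3, 224, 224)   # dummy NCHW batch, sides divisible by 32
    c, h, w = 2048, 224 // 32, 224 // 32  # resnet50 downsamples x2 five times
    feats = net(images)                   # flat view over the batch: n x (c*h*w)
    feats = rearrange(feats, "n (c h w) -> n (h w) c", c=c, h=h, w=w)
    macs = reduce(feats, "n s c -> n c", "max")  # Maximum Activations of Convolution
    print(macs.shape)                     # torch.Size([2, 2048])
```

The stream-index tool trains the quantizer on a sample of these descriptors and then adds the remaining ones to the index.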
+ + +### save-feature + +Extracts high level feature maps and computes MACs for image frames from a trained convolutional neural net. + + +### save-frames + +Extracts semantic key frames from videos based on a trained convolutional net for feature similarity between frames. + + +### query-server + +Loads up the index (slow) and keeps it in memory to handle nearest neighbor queries (fast). +Responds to queries by searching the index, aggregating results, and re-ranking them. + + +### query-client + +Sends nearest neighbor requests against the query server and reports results to the user. +The query and results are based on the saved MAC features (see the request sketch below). + + +### model-train + +Trains a binary classification model on a dataset (potentially noisy and obtained from the index). +We recommend running this step on GPUs. + + +### model-infer + +Predicts binary classification labels on a dataset using a trained model. + + +## Development + +Create a self-contained reproducible development environment + + make i + +Get into the development environment + + make r + +The Python source code directory is mounted into the container: changes you make on the host are visible inside the container. + +To make data visible in the container set the `datadir` variable, e.g. to make your `/tmp` directory show up in `/data` inside the container run + + make r datadir=/tmp + +See the `Makefile` for options and more advanced targets. + + +## References + +- [Particular object retrieval with integral max-pooling of CNN activations](https://arxiv.org/abs/1511.05879) +- Product Quantizer (PQ) [part 1](http://mccormickml.com/2017/10/13/product-quantizer-tutorial-part-1/), and [part 2](http://mccormickml.com/2017/10/22/product-quantizer-tutorial-part-2/) +- [Product Quantization for Nearest Neighbor Search](https://hal.inria.fr/file/index/docid/514462/filename/paper_hal.pdf) +- [Billion-scale similarity search with GPUs](https://arxiv.org/pdf/1702.08734.pdf) +- [faiss wiki](https://github.com/facebookresearch/faiss/wiki) + + +## License + +Copyright © 2019 MoabitCoin + +Distributed under the MIT License (MIT).
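For reference, the protocol between query-client and query-server (see `sfi/tools/client.py` and `sfi/tools/server.py` further down in this diff) is a single JSON POST carrying a base64-encoded feature array. Below is a minimal hand-rolled request sketch; the `frame.npy` file, host and port are assumptions for illustration only.

```python
# Hand-rolled query against a running query-server, mirroring sfi/tools/client.py.
# "frame.npy" is a hypothetical MAC descriptor written by the save-feature tool.
import base64
import json

import numpy as np
import requests

feature = np.load("frame.npy", allow_pickle=False)
if feature.ndim == 1:                 # treat a flat (C,) vector as (1, C)
    feature = feature[np.newaxis, :]

payload = {
    "num_results": 10,
    "feature": base64.b64encode(feature.ravel()).decode("utf-8"),
    "shape": list(feature.shape),
    "dtype": str(feature.dtype),
}

res = requests.post("http://127.0.0.1:5000", data=json.dumps(payload))
res.raise_for_status()
print(res.json())  # [{"distance": ..., "path": ...}, ...] sorted by distance
```

The server decodes the buffer with `np.frombuffer`, reshapes it according to `shape`, and answers with a list of `{distance, path}` records.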
diff --git a/bin/sfi b/bin/sfi new file mode 100755 index 0000000..9f648f8 --- /dev/null +++ b/bin/sfi @@ -0,0 +1,3 @@ +#!/bin/bash + +python3 -m sfi.tools "$@" diff --git a/requirements.in b/requirements.in new file mode 100644 index 0000000..320a6af --- /dev/null +++ b/requirements.in @@ -0,0 +1,7 @@ +numpy +pillow +tqdm +flask +requests +einops +scikit-video diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..49c0fd5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,145 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile --generate-hashes +# +certifi==2019.6.16 \ + --hash=sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939 \ + --hash=sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695 \ + # via requests +chardet==3.0.4 \ + --hash=sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae \ + --hash=sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691 \ + # via requests +click==7.0 \ + --hash=sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13 \ + --hash=sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7 \ + # via flask +einops==0.1.0 \ + --hash=sha256:4ab512fe059c0841e1a315449ca9d7f35eaa05c8c095a14f2c1b92b2b77684d2 \ + --hash=sha256:4fd64864fcb8159074da3213b9327c242536784416cbf423745ef8579850d30b +flask==1.1.1 \ + --hash=sha256:13f9f196f330c7c2c5d7a5cf91af894110ca0215ac051b5844701f2bfd934d52 \ + --hash=sha256:45eb5a6fd193d6cf7e0cf5d8a5b31f83d5faae0293695626f539a823e93b13f6 +idna==2.8 \ + --hash=sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407 \ + --hash=sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c \ + # via requests +itsdangerous==1.1.0 \ + --hash=sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19 \ + --hash=sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749 \ + # via flask +jinja2==2.10.1 \ + --hash=sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013 \ + --hash=sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b \ + # via flask +markupsafe==1.1.1 \ + --hash=sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473 \ + --hash=sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161 \ + --hash=sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235 \ + --hash=sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5 \ + --hash=sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff \ + --hash=sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b \ + --hash=sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1 \ + --hash=sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e \ + --hash=sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183 \ + --hash=sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66 \ + --hash=sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1 \ + --hash=sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1 \ + --hash=sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e \ + --hash=sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b \ + --hash=sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905 \ + --hash=sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735 \ + 
--hash=sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d \ + --hash=sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e \ + --hash=sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d \ + --hash=sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c \ + --hash=sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21 \ + --hash=sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2 \ + --hash=sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5 \ + --hash=sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b \ + --hash=sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6 \ + --hash=sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f \ + --hash=sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f \ + --hash=sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7 \ + # via jinja2 +numpy==1.17.0 \ + --hash=sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e \ + --hash=sha256:0cdd229a53d2720d21175012ab0599665f8c9588b3b8ffa6095dd7b90f0691dd \ + --hash=sha256:312bb18e95218bedc3563f26fcc9c1c6bfaaf9d453d15942c0839acdd7e4c473 \ + --hash=sha256:464b1c48baf49e8505b1bb754c47a013d2c305c5b14269b5c85ea0625b6a988a \ + --hash=sha256:5adfde7bd3ee4864536e230bcab1c673f866736698724d5d28c11a4d63672658 \ + --hash=sha256:7724e9e31ee72389d522b88c0d4201f24edc34277999701ccd4a5392e7d8af61 \ + --hash=sha256:8d36f7c53ae741e23f54793ffefb2912340b800476eb0a831c6eb602e204c5c4 \ + --hash=sha256:910d2272403c2ea8a52d9159827dc9f7c27fb4b263749dca884e2e4a8af3b302 \ + --hash=sha256:951fefe2fb73f84c620bec4e001e80a80ddaa1b84dce244ded7f1e0cbe0ed34a \ + --hash=sha256:9588c6b4157f493edeb9378788dcd02cb9e6a6aeaa518b511a1c79d06cbd8094 \ + --hash=sha256:9ce8300950f2f1d29d0e49c28ebfff0d2f1e2a7444830fbb0b913c7c08f31511 \ + --hash=sha256:be39cca66cc6806652da97103605c7b65ee4442c638f04ff064a7efd9a81d50a \ + --hash=sha256:c3ab2d835b95ccb59d11dfcd56eb0480daea57cdf95d686d22eff35584bc4554 \ + --hash=sha256:eb0fc4a492cb896346c9e2c7a22eae3e766d407df3eb20f4ce027f23f76e4c54 \ + --hash=sha256:ec0c56eae6cee6299f41e780a0280318a93db519bbb2906103c43f3e2be1206c \ + --hash=sha256:f4e4612de60a4f1c4d06c8c2857cdcb2b8b5289189a12053f37d3f41f06c60d0 +pillow==6.1.0 \ + --hash=sha256:0804f77cb1e9b6dbd37601cee11283bba39a8d44b9ddb053400c58e0c0d7d9de \ + --hash=sha256:0ab7c5b5d04691bcbd570658667dd1e21ca311c62dcfd315ad2255b1cd37f64f \ + --hash=sha256:0b3e6cf3ea1f8cecd625f1420b931c83ce74f00c29a0ff1ce4385f99900ac7c4 \ + --hash=sha256:365c06a45712cd723ec16fa4ceb32ce46ad201eb7bbf6d3c16b063c72b61a3ed \ + --hash=sha256:38301fbc0af865baa4752ddae1bb3cbb24b3d8f221bf2850aad96b243306fa03 \ + --hash=sha256:3aef1af1a91798536bbab35d70d35750bd2884f0832c88aeb2499aa2d1ed4992 \ + --hash=sha256:3fe0ab49537d9330c9bba7f16a5f8b02da615b5c809cdf7124f356a0f182eccd \ + --hash=sha256:45a619d5c1915957449264c81c008934452e3fd3604e36809212300b2a4dab68 \ + --hash=sha256:49f90f147883a0c3778fd29d3eb169d56416f25758d0f66775db9184debc8010 \ + --hash=sha256:571b5a758baf1cb6a04233fb23d6cf1ca60b31f9f641b1700bfaab1194020555 \ + --hash=sha256:5ac381e8b1259925287ccc5a87d9cf6322a2dc88ae28a97fe3e196385288413f \ + --hash=sha256:6153db744a743c0c8c91b8e3b9d40e0b13a5d31dbf8a12748c6d9bfd3ddc01ad \ + --hash=sha256:6fd63afd14a16f5d6b408f623cc2142917a1f92855f0df997e09a49f0341be8a \ + --hash=sha256:70acbcaba2a638923c2d337e0edea210505708d7859b87c2bd81e8f9902ae826 \ + 
--hash=sha256:70b1594d56ed32d56ed21a7fbb2a5c6fd7446cdb7b21e749c9791eac3a64d9e4 \ + --hash=sha256:76638865c83b1bb33bcac2a61ce4d13c17dba2204969dedb9ab60ef62bede686 \ + --hash=sha256:7b2ec162c87fc496aa568258ac88631a2ce0acfe681a9af40842fc55deaedc99 \ + --hash=sha256:7cee2cef07c8d76894ebefc54e4bb707dfc7f258ad155bd61d87f6cd487a70ff \ + --hash=sha256:7d16d4498f8b374fc625c4037742fbdd7f9ac383fd50b06f4df00c81ef60e829 \ + --hash=sha256:b50bc1780681b127e28f0075dfb81d6135c3a293e0c1d0211133c75e2179b6c0 \ + --hash=sha256:bd0582f831ad5bcad6ca001deba4568573a4675437db17c4031939156ff339fa \ + --hash=sha256:cfd40d8a4b59f7567620410f966bb1f32dc555b2b19f82a91b147fac296f645c \ + --hash=sha256:e3ae410089de680e8f84c68b755b42bc42c0ceb8c03dbea88a5099747091d38e \ + --hash=sha256:e9046e559c299b395b39ac7dbf16005308821c2f24a63cae2ab173bd6aa11616 \ + --hash=sha256:ef6be704ae2bc8ad0ebc5cb850ee9139493b0fc4e81abcc240fb392a63ebc808 \ + --hash=sha256:f8dc19d92896558f9c4317ee365729ead9d7bbcf2052a9a19a3ef17abbb8ac5b +requests==2.22.0 \ + --hash=sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4 \ + --hash=sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31 +scikit-video==1.1.11 \ + --hash=sha256:4fc131e509aaeeb0eecb6acb58b92a7ef905be5dbe27ed1d1ae089634b601f23 \ + --hash=sha256:5061d2aeae1892b73a97c89a82942b3e8eebf2fe23e59c60e06ede5f8a24ed1e +scipy==1.3.1 \ + --hash=sha256:0baa64bf42592032f6f6445a07144e355ca876b177f47ad8d0612901c9375bef \ + --hash=sha256:243b04730d7223d2b844bda9500310eecc9eda0cba9ceaf0cde1839f8287dfa8 \ + --hash=sha256:2643cfb46d97b7797d1dbdb6f3c23fe3402904e3c90e6facfe6a9b98d808c1b5 \ + --hash=sha256:396eb4cdad421f846a1498299474f0a3752921229388f91f60dc3eda55a00488 \ + --hash=sha256:3ae3692616975d3c10aca6d574d6b4ff95568768d4525f76222fb60f142075b9 \ + --hash=sha256:435d19f80b4dcf67dc090cc04fde2c5c8a70b3372e64f6a9c58c5b806abfa5a8 \ + --hash=sha256:46a5e55850cfe02332998b3aef481d33f1efee1960fe6cfee0202c7dd6fc21ab \ + --hash=sha256:75b513c462e58eeca82b22fc00f0d1875a37b12913eee9d979233349fce5c8b2 \ + --hash=sha256:7ccfa44a08226825126c4ef0027aa46a38c928a10f0a8a8483c80dd9f9a0ad44 \ + --hash=sha256:89dd6a6d329e3f693d1204d5562dd63af0fd7a17854ced17f9cbc37d5b853c8d \ + --hash=sha256:a81da2fe32f4eab8b60d56ad43e44d93d392da228a77e229e59b51508a00299c \ + --hash=sha256:a9d606d11eb2eec7ef893eb825017fbb6eef1e1d0b98a5b7fc11446ebeb2b9b1 \ + --hash=sha256:ac37eb652248e2d7cbbfd89619dce5ecfd27d657e714ed049d82f19b162e8d45 \ + --hash=sha256:cbc0611699e420774e945f6a4e2830f7ca2b3ee3483fca1aa659100049487dd5 \ + --hash=sha256:d02d813ec9958ed63b390ded463163685af6025cb2e9a226ec2c477df90c6957 \ + --hash=sha256:dd3b52e00f93fd1c86f2d78243dfb0d02743c94dd1d34ffea10055438e63b99d \ + # via scikit-video +tqdm==4.33.0 \ + --hash=sha256:1dc82f87a8726602fa7177a091b5e8691d6523138a8f7acd08e58088f51e389f \ + --hash=sha256:47220a4f2aeebbc74b0ab317584264ea44c745e1fd5ff316b675cd0aff8afad8 +urllib3==1.25.3 \ + --hash=sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1 \ + --hash=sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232 \ + # via requests +werkzeug==0.15.5 \ + --hash=sha256:87ae4e5b5366da2347eb3116c0e6c681a0e939a33b2805e2c0cbd282664932c4 \ + --hash=sha256:a13b74dd3c45f758d4ebdb224be8f1ab8ef58b3c0ffc1783a8c7d9f4f50227e6 \ + # via flask diff --git a/scripts/drawlines.py b/scripts/drawlines.py new file mode 100755 index 0000000..2febc55 --- /dev/null +++ b/scripts/drawlines.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import sys + +from PIL import Image +from PIL import 
ImageDraw + + +if len(sys.argv) != 3: + sys.exit("Usage: {} in.jpg out.jpg".format(sys.argv[0])) + + +infile = sys.argv[1] +outfile = sys.argv[2] + +image = Image.open(infile) +w, h = image.size + +draw = ImageDraw.Draw(image) + +n = 7 + +dx = w // n +dy = h // n + +for i in range(1, n): + draw.line([i * dx, 0, i * dx, h], fill="green", width=1) + +for j in range(1, n): + draw.line([0, j * dy, w, j * dy], fill="green", width=1) + +image.save(outfile, optimize=True) diff --git a/scripts/key-frames-to-video b/scripts/key-frames-to-video new file mode 100755 index 0000000..ac39725 --- /dev/null +++ b/scripts/key-frames-to-video @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -lt 2 ]; then + echo "Usage: $(basename $0) framedir video.mp4 [fps] [resolution]" + exit 1 +fi + +readonly framedir="${1}" +readonly video="${2}" +readonly rate="${3:-2}" +readonly resolution="${4:-320x180}" + +ffmpeg -y -loglevel error -r "${rate}" -vsync 0 -f image2 -pattern_type glob -i "${framedir}/*.jpg" -s "${resolution}" -vcodec libx264 "${video}" diff --git a/scripts/scale-crop-image b/scripts/scale-crop-image new file mode 100755 index 0000000..3a8a5f2 --- /dev/null +++ b/scripts/scale-crop-image @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -lt 2 ]; then + echo "Usage: $(basename $0) in.jpg out.jpg [WxH]" + exit 1 +fi + +readonly resolution="${3:-224x224}" + +convert "${1}" -resize "${resolution}^" -gravity Center -extent "${resolution}" "${2}" diff --git a/scripts/split-image b/scripts/split-image new file mode 100755 index 0000000..7756622 --- /dev/null +++ b/scripts/split-image @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -lt 2 ]; then + echo "Usage: $(basename $0) in.jpg outdir [WxH]" + exit 1 +fi + +readonly resolution="${3:-32x32}" + +convert "${1}" -crop "${resolution}" +repage +adjoin "${2}/split-%d.jpg" diff --git a/scripts/video-to-key-frames b/scripts/video-to-key-frames new file mode 100755 index 0000000..0b369ff --- /dev/null +++ b/scripts/video-to-key-frames @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -ne 2 ]; then + echo "Usage: $(basename $0) video.mp4 outdir" + exit 1 +fi + +ffmpeg -y -loglevel error -skip_frame nokey -i "${1}" -vsync 0 -f image2 "${2}/frame-%d.jpg" diff --git a/scripts/video-to-resampled-frames b/scripts/video-to-resampled-frames new file mode 100755 index 0000000..887d8c9 --- /dev/null +++ b/scripts/video-to-resampled-frames @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -ne 3 ]; then + echo "Usage: $(basename $0) video.mp4 outdir fps" + exit 1 +fi + +ffmpeg -y -loglevel error -r "${3}" -i "${1}" -vsync 0 -f image2 "${2}/frame-%d.jpg" diff --git a/sfi/__init__.py b/sfi/__init__.py new file mode 100644 index 0000000..37769da --- /dev/null +++ b/sfi/__init__.py @@ -0,0 +1,4 @@ +"""Semantic Frame Index + + Fast and efficient queries on video frames by semantic similarity. +""" diff --git a/sfi/datasets.py b/sfi/datasets.py new file mode 100644 index 0000000..372dfad --- /dev/null +++ b/sfi/datasets.py @@ -0,0 +1,50 @@ +from PIL import Image + +from torch.utils.data import Dataset + +from sfi.utils import files + +# PyTorch can not transport a Path object through data loaders. +# Serialize Path to str here; users have to encode via Path(path). 
+ + +class ImageDirectory(Dataset): + def __init__(self, root, transform=None): + super().__init__() + + self.paths = files(root) + self.transform = transform + + def __len__(self): + return len(self.paths) + + def __getitem__(self, i): + path = str(self.paths[i]) + image = Image.open(path) + + if self.transform is not None: + image = self.transform(image) + + return image, path + + +class ImageSingleton(Dataset): + def __init__(self, root, transform=None): + super().__init__() + + self.path = root + self.transform = transform + + def __len__(self): + return 1 + + def __getitem__(self, i): + assert i == 0 + + path = str(self.path) + image = Image.open(path) + + if self.transform is not None: + image = self.transform(image) + + return image, path diff --git a/sfi/features.py b/sfi/features.py new file mode 100644 index 0000000..d414310 --- /dev/null +++ b/sfi/features.py @@ -0,0 +1,74 @@ +import sys + +import torch +import torch.nn as nn +from torchvision.models import resnet50 +from torchvision.transforms import Compose, Normalize, ToTensor, Resize + +from einops import rearrange + +from sfi.transforms import ToImageMode, PadToMultiple + + +class FeatureExtractor: + def __init__(self, image_size): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # Set up pre-trained resnet in inference mode + resnet = resnet50(pretrained=True, progress=False) + + # Chop off classification head + resnet.fc = nn.Identity() + + # In addition do not pool, keep spatial information if user wants to + resnet.avgpool = nn.Identity() + + for params in resnet.parameters(): + params.requires_grad = False + + resnet = resnet.to(device) + resnet = nn.DataParallel(resnet) + + resnet.eval() + + self.net = resnet + self.device = device + self.image_size = image_size + + @property + def transform(self): + # ImageNet statistics (because we use pre-trained model) + mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] + + return Compose([ + ToImageMode("RGB"), + Resize(self.image_size), + # resnet50 downsamples x2 five times + PadToMultiple(32, fill=0), + ToTensor(), + Normalize(mean=mean, std=std)]) + + # batch of NCHW image tensors to batch of NHWC feature tensors + def __call__(self, images): + n, c, h, w = images.size(0), 2048, images.size(2), images.size(3) + + assert h % 32 == 0, "height divisible by 32 for resnet50" + assert w % 32 == 0, "width divisible by 32 for resnet50" + + with torch.no_grad(): + images = images.to(self.device) + + # resnet50 downsamples x2 five times + h, w = h // 32, w // 32 + + # resnet50 outputs flat view over a batch with 2048 channels, spatial resolution HxW + # https://github.com/pytorch/vision/blob/ac2e995a4352267f65e7cc6d354bde683a4fb402/torchvision/models/resnet.py#L202-L204 + + features = self.net(images) + features = rearrange(features, "n (c h w) -> n (h w) c", n=n, h=h, w=w, c=c) + + return features diff --git a/sfi/index.py b/sfi/index.py new file mode 100644 index 0000000..5d6424c --- /dev/null +++ b/sfi/index.py @@ -0,0 +1,52 @@ +import numpy as np +from einops import rearrange + +from faiss import IndexPQ + +from sfi.io import IndexIO, JsonIO + + +# TODO: benchmark +kNumResultsPerIndex = 512 + + +class IndexQueryError(Exception): + pass + + +class Index: + def __init__(self, path, metadata, features_size, num_probes=1): + self.index = IndexIO.load(path) + self.index.nprobe = num_probes + + # Disable Polysemous
Codes until we know threshold for MACs + # self.index.search_type = IndexPQ.ST_polysemous + # self.index.polysemous_ht = 768 + + self.metadata = JsonIO.load(metadata) + self.features_size = features_size + + def query(self, query, num_results=1): + N, C = query.shape + + if N != self.features_size * self.features_size: + raise IndexQueryError("query feature size does not match index feature size") + + # C-array required for faiss FFI: tensors might not be contiguous + query = np.ascontiguousarray(query) + + dists, idxs = self.index.search(query, kNumResultsPerIndex) + + dists = rearrange(dists, "() n -> n") + idxs = rearrange(idxs, "() n -> n") + + results = list(zip(dists, idxs)) + + _, uniqued = np.unique([i for _, i in results], return_index=True) + results = [results[i] for i in uniqued] + results = sorted(results, key=lambda v: v[0]) + + results = [(round(d.item(), 3), self.metadata[i]) + for d, i in results[:num_results]] + + return results diff --git a/sfi/io.py b/sfi/io.py new file mode 100644 index 0000000..46d236c --- /dev/null +++ b/sfi/io.py @@ -0,0 +1,36 @@ +import json + +import numpy as np +import faiss + + +class ArrayIO: + @staticmethod + def save(path, x): + return np.save(str(path), x, allow_pickle=False) + + @staticmethod + def load(path): + return np.load(str(path), allow_pickle=False) + + +class IndexIO: + @staticmethod + def save(path, x): + return faiss.write_index(x, str(path)) + + @staticmethod + def load(path): + return faiss.read_index(str(path)) + + +class JsonIO: + @staticmethod + def save(path, x): + with path.open("w") as fd: + return json.dump(x, fd) + + @staticmethod + def load(path): + with path.open("r") as fd: + return json.load(fd) diff --git a/sfi/mixup.py b/sfi/mixup.py new file mode 100644 index 0000000..a830605 --- /dev/null +++ b/sfi/mixup.py @@ -0,0 +1,59 @@ +import torch +import torch.nn as nn + +import numpy as np + + +# Mixup for data augmentation +# https://arxiv.org/abs/1710.09412 + +class MixupDataLoaderAdaptor: + def __init__(self, dataloader, alpha=0.4): + self.dataloader = dataloader + self.dataiter = None + self.alpha = alpha + + def __len__(self): + return len(self.dataloader) + + def __iter__(self): + self.dataiter = iter(self.dataloader) + return self + + def __next__(self): + inputs1, labels1 = next(self.dataiter) + + n = inputs1.size(0) + + # draw t from (symmetric) beta distribution + # take from one side to prevent duplicates + + t = np.random.beta(self.alpha, self.alpha, size=n) + t = np.concatenate([t[:, None], 1 - t[:, None]], axis=1).max(axis=1) + t = torch.FloatTensor(t) + t = t.view(n, 1, 1, 1) + + # shuffle the batch inputs and targets to get second batch + + r = np.random.permutation(n) + inputs2, labels2 = inputs1[r], labels1[r] + + # mix up the original batch with the shuffled batch + + inputs = t * inputs1 + (1 - t) * inputs2 + + # With CrossEntropy we do not need the mixed up labels + # labels = t * labels1.float() + (1 - t) * labels2.float() + + return inputs, t, labels1, labels2 + + +class MixupCrossEntropyLossAdaptor(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self.criterion = nn.CrossEntropyLoss(*args, **kwargs) + + def forward(self, outputs, t, labels1, labels2): + lhs = t * self.criterion(outputs, labels1) + rhs = (1 - t) * self.criterion(outputs, labels2) + return (lhs + rhs).mean() diff --git a/sfi/tools/__init__.py b/sfi/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sfi/tools/__main__.py b/sfi/tools/__main__.py new file mode 100644 index 
0000000..bbf2621 --- /dev/null +++ b/sfi/tools/__main__.py @@ -0,0 +1,89 @@ +from pathlib import Path + +import argparse + +import sfi.tools.frames +import sfi.tools.feature +import sfi.tools.feature3d +import sfi.tools.stream +import sfi.tools.server +import sfi.tools.client +import sfi.tools.train +import sfi.tools.infer +import sfi.tools.export + +parser = argparse.ArgumentParser(prog="sficmd") +subcmd = parser.add_subparsers(title="commands", metavar="") +subcmd.required = True + +Fmt = argparse.ArgumentDefaultsHelpFormatter + +frames = subcmd.add_parser("save-frames", help="saves key frames for video", formatter_class=Fmt) +frames.add_argument("--video", type=Path, required=True, help="file load video from") +frames.add_argument("--frames", type=Path, required=True, help="directory to save key frames to") +frames.add_argument("--similarity", type=float, default=0.95, help="similarity key frame threshold") +frames.add_argument("--pool", choices=["mean", "max"], default="mean", help="spatial pooling mode") +frames.add_argument("--image-size", type=int, default=7 * 32, choices=[v * 32 for v in range(1, 15)]) +frames.add_argument("--batch-size", type=int, default=8) +frames.set_defaults(main=sfi.tools.frames.main) + +stream = subcmd.add_parser("stream-index", help="builds an index in streaming mode", formatter_class=Fmt) +stream.add_argument("--index", type=Path, required=True, help="file to save index to") +stream.add_argument("--frames", type=Path, required=True, help="directory to load image frames from") +stream.add_argument("--num-train", type=int, required=True, help="number of samples to train on") +stream.add_argument("--image-size", type=int, default=14 * 32, choices=[v * 32 for v in range(1, 15)]) +stream.add_argument("--batch-size", type=int, default=64) +stream.add_argument("--num-workers", type=int, default=0) +stream.set_defaults(main=sfi.tools.stream.main) + +feature = subcmd.add_parser("save-feature", help="saves features for frames", formatter_class=Fmt) +feature.add_argument("--frame", type=Path, required=True, help="path to image frame") +feature.add_argument("--feature", type=Path, required=True, help="path to save features to") +feature.add_argument("--image-size", type=int, default=14 * 32, choices=[v * 32 for v in range(1, 15)]) +feature.set_defaults(main=sfi.tools.feature.main) + +feature3d = subcmd.add_parser("save-feature3d", help="saves features for videos", formatter_class=Fmt) +feature3d.add_argument("--video", type=Path, required=True, help="path to video") +feature3d.add_argument("--feature", type=Path, required=True, help="path to save features to") +feature3d.add_argument("--timesteps", type=int, default=64, help="frames per sequence along time axis") +feature3d.set_defaults(main=sfi.tools.feature3d.main) + +server = subcmd.add_parser("query-server", help="starts up the index query http server", formatter_class=Fmt) +server.add_argument("--index", type=Path, required=True, help="file to load index from") +server.add_argument("--host", type=str, default="127.0.0.1") +server.add_argument("--port", type=int, default=5000) +server.add_argument("--num-probes", type=int, default=1, help="number of cells to visit during search") +server.add_argument("--features-size", type=int, default=1, choices=range(1, 15)) +server.set_defaults(main=sfi.tools.server.main) + +client = subcmd.add_parser("query-client", help="queries the query server for similar features", formatter_class=Fmt) +client.add_argument("--host", type=str, default="127.0.0.1") 
+client.add_argument("--port", type=int, default=5000) +client.add_argument("--query", type=Path, required=True, help="feature file to query the index with") +client.add_argument("--num-results", type=int, default=10, help="number of similar frames to query for") +client.set_defaults(main=sfi.tools.client.main) + +train = subcmd.add_parser("model-train", help="trains a classifier model", formatter_class=Fmt) +train.add_argument("--model", type=Path, required=True, help="file to save trained model to") +train.add_argument("--resume-from", type=Path, help="file to load trained model from") +train.add_argument("--dataset", type=Path, required=True, help="directory to load dataset from") +train.add_argument("--batch-size", type=int, default=24) +train.add_argument("--num-workers", type=int, default=0) +train.add_argument("--num-epochs", type=int, default=100) +train.set_defaults(main=sfi.tools.train.main) + +infer = subcmd.add_parser("model-infer", help="runs inference with a classifier model", formatter_class=Fmt) +infer.add_argument("--model", type=Path, required=True, help="file to load trained model from") +infer.add_argument("--dataset", type=Path, required=True, help="directory to load dataset from") +infer.add_argument("--results", type=Path, required=True, help="file to save results to") +infer.add_argument("--batch-size", type=int, default=64) +infer.add_argument("--num-workers", type=int, default=0) +infer.set_defaults(main=sfi.tools.infer.main) + +export = subcmd.add_parser("model-export", help="exports a classifier model to onnx", formatter_class=Fmt) +export.add_argument("--model", type=Path, required=True, help="file to load trained model from") +export.add_argument("--onnx", type=Path, required=True, help="file to save trained onnx model to") +export.set_defaults(main=sfi.tools.export.main) + +args = parser.parse_args() +args.main(args) diff --git a/sfi/tools/client.py b/sfi/tools/client.py new file mode 100644 index 0000000..c224ccf --- /dev/null +++ b/sfi/tools/client.py @@ -0,0 +1,33 @@ +import sys +import json +import base64 + +import requests +from einops import rearrange + +from sfi.io import ArrayIO + + +def main(args): + query = ArrayIO.load(args.query) + + if len(query.shape) == 1: # handle (C,) as (1, C) + query = rearrange(query, "n -> () n") + + N, C = query.shape + dtype = str(query.dtype) + feature = base64.b64encode(query.ravel()).decode("utf-8") + + url = "http://{}:{}".format(args.host, args.port) + + payload = {"num_results": args.num_results, + "feature": feature, + "shape": [N, C], + "dtype": dtype} + + res = requests.post(url, data=json.dumps(payload)) + + if res.status_code != requests.codes.ok: + sys.exit("Error: unable to query server") + + print(json.dumps(res.json())) diff --git a/sfi/tools/export.py b/sfi/tools/export.py new file mode 100644 index 0000000..343592f --- /dev/null +++ b/sfi/tools/export.py @@ -0,0 +1,30 @@ +import sys + +import torch +import torch.onnx +import torch.nn as nn + +from torchvision.models import resnet50 + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # Binary classifier on top of resnet50 + model = resnet50() + model.fc = nn.Linear(model.fc.in_features, 2) + + model = model.to(device) + model = nn.DataParallel(model) + + # Restore trained weights + weights = torch.load(str(args.model), map_location=device) + model.load_state_dict(weights) + + # Run
dummy batch through model to trace computational graph + batch = torch.rand(1, 3, 224, 224, device=device) + + torch.onnx.export(model.module, batch, str(args.onnx)) diff --git a/sfi/tools/feature.py b/sfi/tools/feature.py new file mode 100644 index 0000000..8d450cf --- /dev/null +++ b/sfi/tools/feature.py @@ -0,0 +1,32 @@ +from torch.utils.data import DataLoader + +from einops import reduce + +from sfi.datasets import ImageSingleton +from sfi.features import FeatureExtractor +from sfi.io import ArrayIO + + +def main(args): + extract = FeatureExtractor(image_size=args.image_size) + + # We use this tool to compute query features on images of arbitrary sizes. + # That's why we can not batch images and have to feed them one by one. + + dataset = ImageSingleton(root=args.frame, transform=extract.transform) + loader = DataLoader(dataset, batch_size=1, num_workers=0) + + for images, paths in loader: + assert images.size(0) == 1, "image batch size of one for required" + + n, c, h, w = images.size(0), 2048, images.size(2), images.size(3) + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # MAC feature descriptor + features = extract(images) + features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=c) + features = features.data.cpu().numpy() + + ArrayIO.save(args.feature, features[0]) diff --git a/sfi/tools/feature3d.py b/sfi/tools/feature3d.py new file mode 100644 index 0000000..99f6e3d --- /dev/null +++ b/sfi/tools/feature3d.py @@ -0,0 +1,66 @@ +import sys + +import torch +import torch.nn as nn + +from torchvision.models.video import r2plus1d_18 + +from einops import rearrange + +from skvideo.io import vread + +from sfi.utils import batched + + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # r2d2 says "beep beep" + resnet = r2plus1d_18(pretrained=True, progress=False) + + resnet.fc = nn.Identity() + # resnet.avgpool = nn.Identity() + + for params in resnet.parameters(): + params.requires_grad = False + + resnet = resnet.to(device) + resnet = nn.DataParallel(resnet) + + resnet.eval() + + # Pre-trained Kinetics-400 statistics for normalization + mean, std = [0.43216, 0.394666, 0.37645], [0.22803, 0.22145, 0.216989] + + mean = rearrange(torch.as_tensor(mean), "n -> () n () ()") + std = rearrange(torch.as_tensor(std), "n -> () n () ()") + + video = vread(str(args.video)) + + with torch.no_grad(): + for i, batch in enumerate(batched(video, args.timesteps)): + # TODO: + # - encapsulate video dataset + # - abstract away transforms + # - fix timesteps vs batching + + batch = rearrange(batch, "t h w c -> t c h w") + batch = torch.tensor(batch) + batch = batch.to(torch.float32) / 255 + + batch = (batch - mean) / std + + # model expects NxCxTxHxW + inputs = rearrange(batch, "t c h w -> () c t h w") + inputs = inputs.to(device) + + outputs = resnet(inputs) + outputs = rearrange(outputs, "() n -> n") + outputs = outputs.data.cpu().numpy() + + print("seq={}, frames=range({}, {}), prediction={}" + .format(i, i * args.timesteps, (i + 1) * args.timesteps, outputs.shape)) diff --git a/sfi/tools/frames.py b/sfi/tools/frames.py new file mode 100644 index 0000000..814bca5 --- /dev/null +++ b/sfi/tools/frames.py @@ -0,0 +1,67 @@ +import sys + +from torch.utils.data import DataLoader + +from PIL import Image + +import numpy as np +from einops import reduce + +from skvideo.io import vread + +from 
sfi.features import FeatureExtractor +from sfi.utils import batched + + +def main(args): + args.frames.mkdir(exist_ok=True) + + key = None + video = vread(str(args.video)) + extract = FeatureExtractor(image_size=args.image_size) + + nframes, nkeys = 0, 0 + + for i, batch in enumerate(batched(video, args.batch_size)): + # We should use the IterableDataset from upcoming PyTorch version for FramesDataset + + frames = [Image.fromarray(each) for each in batch] + + dataset = [extract.transform(frame) for frame in frames] + dataloader = DataLoader(dataset, batch_size=args.batch_size) + + assert len(dataloader) == 1 + images = next(iter(dataloader)) + + n, c, h, w = images.size(0), 2048, images.size(2), images.size(3) + + features = extract(images) + features = features.data.cpu().numpy() + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + features = reduce(features, "n (h w) c -> n c", reduction=args.pool, n=n, h=h, w=w, c=c) + + for j, (frame, feature) in enumerate(zip(frames, features)): + nframes += 1 + + fid = i * args.batch_size + j + + if key: + prev_frame, prev_feature = key + + if similarity(prev_feature, feature) > args.similarity: + continue + + nkeys += 1 + key = frame, feature + frame.save(args.frames / "{:010d}.jpg".format(fid)) + + if nframes != 0: + print("Processed total={} keep={} drop={} ratio={}" + .format(nframes, nkeys, nframes - nkeys, round(nkeys / nframes, 2)), file=sys.stderr) + + +def similarity(x, y): + return (x @ y) / (np.linalg.norm(x) * np.linalg.norm(y)) diff --git a/sfi/tools/infer.py b/sfi/tools/infer.py new file mode 100644 index 0000000..d8dd4ae --- /dev/null +++ b/sfi/tools/infer.py @@ -0,0 +1,70 @@ +import sys +from pathlib import Path + +import torch +import torch.nn as nn +import torch.backends.cudnn +from torch.utils.data import DataLoader + +from torchvision.models import resnet50 +from torchvision.transforms import Compose, Normalize, ToTensor, Resize, CenterCrop + +from tqdm import tqdm + +from sfi.io import JsonIO +from sfi.datasets import ImageDirectory +from sfi.transforms import ToImageMode + + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # ImageNet statistics (because we use pre-trained model) + mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] + + transform = Compose([ + ToImageMode("RGB"), + Resize(256), + CenterCrop(224), + ToTensor(), + Normalize(mean=mean, std=std)]) + + dataset = ImageDirectory(root=args.dataset, transform=transform) + dataloader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers) + + # Binary classifier on top of resnet50 + model = resnet50() + model.fc = nn.Linear(model.fc.in_features, 2) + + model = model.to(device) + model = nn.DataParallel(model) + + # Restore trained weights + weights = torch.load(str(args.model), map_location=device) + model.load_state_dict(weights) + + model.eval() + + results = [] + + with torch.no_grad(): + for inputs, paths in tqdm(dataloader, desc="infer", unit="batch", ascii=True): + inputs = inputs.to(device) + + outputs = model(inputs) + + _, preds = torch.max(outputs, dim=1) + preds = preds.data.cpu().numpy() + + probs = nn.functional.softmax(outputs, dim=1) + probs = probs.data.cpu().numpy() + + for path, pred, prob in zip(paths, preds, probs): + result = {"class": pred.item(), "probability": round(prob.max().item(), 3), "path": Path(path).name} + 
results.append(result) + + JsonIO.save(args.results, results) diff --git a/sfi/tools/server.py b/sfi/tools/server.py new file mode 100644 index 0000000..6b263be --- /dev/null +++ b/sfi/tools/server.py @@ -0,0 +1,58 @@ +import sys +import base64 +import binascii + +import numpy as np +from einops import rearrange + +from flask import Flask, request, jsonify, abort + +from sfi.index import Index, IndexQueryError + + +app = Flask(__name__) +index = None + + +@app.route("/", methods=["POST"]) +def query(): + if not index: + return abort(503) + + req = request.get_json(force=True, silent=False, cache=False) + + if not all(v in req for v in ["feature", "shape", "dtype"]): + return abort(400) + + try: + feature = base64.b64decode(req["feature"]) + except binascii.Error: + return abort(400) + + N, C = req["shape"] + dtype = req["dtype"] + + try: + vs = np.frombuffer(feature, dtype=dtype) + vs = rearrange(vs, "(n c) -> n c", n=N, c=C) + except ValueError: + return abort(400) + + num_results = req.get("num_results", 1) + + try: + results = index.query(vs, num_results=num_results) + except IndexQueryError: + return abort(400) + + return jsonify([{"distance": d, "path": p} for d, p in results]) + + +def main(args): + print("Loading index from disk", file=sys.stderr) + + global index + index = Index(path=args.index, metadata=args.index.with_suffix(".json"), + features_size=args.features_size, num_probes=args.num_probes) + + app.run(host=args.host, port=args.port, debug=False) diff --git a/sfi/tools/stream.py b/sfi/tools/stream.py new file mode 100644 index 0000000..102421d --- /dev/null +++ b/sfi/tools/stream.py @@ -0,0 +1,92 @@ +import sys +from pathlib import Path + +from torch.utils.data import DataLoader, random_split + +import numpy as np +from einops import reduce + +from faiss import IndexFlatL2, IndexIVFPQ + +from tqdm import tqdm + +from sfi.datasets import ImageDirectory +from sfi.features import FeatureExtractor +from sfi.io import IndexIO, JsonIO + +kNumCells = 100 +kNumCentroids = 256 # Note: on gpu this will not work; see links below +kNumBitsPerIdx = 8 + +# Gpu centroid limitations +# - https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/gpu/impl/IVFPQ.cu#L69-L92 +# - https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/ProductQuantizer.cpp#L189 + + +def main(args): + # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80 + if args.num_train < max(kNumCells, kNumCentroids): + sys.exit("Error: require at least {} training samples".format(max(kNumCells, kNumCentroids))) + + extract = FeatureExtractor(image_size=args.image_size) + + dataset = ImageDirectory(root=args.frames, transform=extract.transform) + train_dataset, index_dataset = random_split(dataset, [args.num_train, len(dataset) - args.num_train]) + + if len(train_dataset) > len(index_dataset) or len(train_dataset) > 0.25 * len(index_dataset): + sys.exit("Error: training dataset too big: train={}, index={}".format(len(train_dataset), len(index_dataset))) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers) + index_loader = DataLoader(index_dataset, batch_size=args.batch_size, num_workers=args.num_workers) + + N, C = len(train_dataset), 2048 + + train_features = np.empty(shape=(N, C), dtype=np.float32) + + for i, (images, paths) in enumerate(tqdm(train_loader, desc="Train", unit="batch", ascii=True)): + n, h, w = images.size(0), images.size(2), 
images.size(3) + + features = extract(images) + features = features.data.cpu().numpy() + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # MAC feature + features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=C) + + train_features[i * args.batch_size: i * args.batch_size + n] = features + + quantizer = IndexFlatL2(C) + + index = IndexIVFPQ(quantizer, C, kNumCells, kNumCentroids, kNumBitsPerIdx) + index.do_polysemous_training = True + + print("Training index on training features", file=sys.stderr) + index.train(train_features) + + metadata = [] + + for images, paths in tqdm(index_loader, desc="Index", unit="batch", ascii=True): + n, h, w = images.size(0), images.size(2), images.size(3) + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # MAC feature descriptor + features = extract(images) + features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=C) + features = features.data.cpu().numpy() + + # C-array required for faiss FFI: tensors might not be contiguous + features = np.ascontiguousarray(features) + + # Add a batch of (batch*49, 2048) unpooled features to the index at once + index.add(features) + + for path in paths: + fname = Path(path).name + metadata.append(fname) + + IndexIO.save(args.index.with_suffix(".idx"), index) + JsonIO.save(args.index.with_suffix(".json"), metadata) diff --git a/sfi/tools/train.py b/sfi/tools/train.py new file mode 100644 index 0000000..fe0ca8b --- /dev/null +++ b/sfi/tools/train.py @@ -0,0 +1,153 @@ +import sys +import copy +import collections + +import torch +import torch.nn as nn +import torch.optim +import torch.backends.cudnn +from torch.utils.data import DataLoader + +from torchvision.models import resnet50 +from torchvision.datasets import ImageFolder +from torchvision.transforms import Compose, Normalize, ToTensor, Resize, RandomHorizontalFlip + +from tqdm import tqdm + +from sfi.transforms import ToImageMode +from sfi.mixup import MixupDataLoaderAdaptor, MixupCrossEntropyLossAdaptor +from sfi.utils import decay_weights + + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # ImageNet statistics (because we use pre-trained model) + mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] + + transform = Compose([ + ToImageMode("RGB"), + Resize(256), + RandomHorizontalFlip(), + ToTensor(), + Normalize(mean=mean, std=std)]) + + train_dataset = ImageFolder(root=args.dataset / "train", transform=transform) + val_dataset = ImageFolder(root=args.dataset / "val", transform=transform) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True) + train_loader = MixupDataLoaderAdaptor(train_loader) + + val_loader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False) + + model = resnet50(pretrained=True, progress=False) + + # Add binary classification head + model.fc = nn.Linear(model.fc.in_features, 2) + + model = model.to(device) + model = nn.DataParallel(model) + + if args.resume_from: + weights = torch.load(str(args.resume_from), map_location=device) + model.load_state_dict(weights) + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) + + counts = collections.Counter(train_dataset.targets).values() + weight = torch.tensor([min(counts) / v for v in counts]).to(device) + + train_criterion = 
MixupCrossEntropyLossAdaptor(weight=weight) + val_criterion = nn.CrossEntropyLoss(weight=weight) + + best_wts = copy.deepcopy(model.state_dict()) + best_acc = 0.0 + + for epoch in range(args.num_epochs): + print("Epoch {}/{}".format(epoch, args.num_epochs - 1)) + print("-" * 10) + + loss, _, _, _ = train(model, train_criterion, optimizer, device, + dataset=train_dataset, dataloader=train_loader) + + print("train loss: {:.4f}".format(loss)) + + loss, acc, precision, recall = validate(model, val_criterion, device, + dataset=val_dataset, dataloader=val_loader) + + print("val loss: {:.4f} acc: {:.4f} precision: {:.4f} recall: {:.4f}".format(loss, acc, precision, recall)) + + if acc > best_acc: + best_acc = acc + best_wts = copy.deepcopy(model.state_dict()) + + print() + + print("Best acc: {:.4f}".format(best_acc)) + + torch.save(best_wts, str(args.model)) + + +def train(model, criterion, optimizer, device, dataset, dataloader): + model.train() + + running_loss = 0.0 + + for inputs, t, labels1, labels2 in tqdm(dataloader, desc="train", unit="batch", ascii=True): + inputs = inputs.to(device) + t = t.to(device) + labels1 = labels1.to(device) + labels2 = labels2.to(device) + + optimizer.zero_grad() + + outputs = model(inputs) + + loss = criterion(outputs, t, labels1, labels2) + + loss.backward() + decay_weights(optimizer, 1e-4) + optimizer.step() + + running_loss += loss.item() * inputs.size(0) + + epoch_loss = running_loss / len(dataset) + + return epoch_loss, float("NaN"), float("NaN"), float("NaN") + + +def validate(model, criterion, device, dataset, dataloader): + model.eval() + + running_loss = 0.0 + tn, fn, tp, fp = 0, 0, 0, 0 + + with torch.no_grad(): + for inputs, labels in tqdm(dataloader, desc="val", unit="batch", ascii=True): + inputs = inputs.to(device) + labels = labels.to(device) + + outputs = model(inputs) + _, preds = torch.max(outputs, dim=1) + + loss = criterion(outputs, labels) + + running_loss += loss.item() * inputs.size(0) + + confusion = preds.float() / labels.float() + tn += torch.sum(torch.isnan(confusion)).item() + fp += torch.sum(confusion == float("inf")).item() + tp += torch.sum(confusion == 1).item() + fn += torch.sum(confusion == 0).item() + + epoch_loss = running_loss / len(dataset) + + accuracy = (tp + tn) / (tp + tn + fp + fn) + precision = tp / (tp + fp) + recall = tp / (tp + fn) + + return epoch_loss, accuracy, precision, recall diff --git a/sfi/transforms.py b/sfi/transforms.py new file mode 100644 index 0000000..4320a9b --- /dev/null +++ b/sfi/transforms.py @@ -0,0 +1,44 @@ +import torchvision.transforms.functional as F + + +def to_image_mode(image, mode): + return image.convert(mode) + + +class ToImageMode: + def __init__(self, mode): + self.mode = mode + + def __call__(self, image): + return to_image_mode(image, self.mode) + + +def pad_to_multiple(image, multiple, fill=0, padding_mode="constant"): + w, h = image.size + + def next_multiple_of(n, multiple): + return ((n // multiple) + int(bool(n % multiple))) * multiple + + padded_w = next_multiple_of(w, multiple) + padded_h = next_multiple_of(h, multiple) + + pad_left = (padded_w - w) // 2 + pad_right = pad_left + (padded_w - w) % 2 + + pad_top = (padded_h - h) // 2 + pad_bottom = pad_top + (padded_h - h) % 2 + + padding = (pad_left, pad_top, pad_right, pad_bottom) + + return F.pad(image, padding, fill=fill, padding_mode=padding_mode) + + +class PadToMultiple: + def __init__(self, multiple, fill=0, padding_mode="constant"): + self.multiple = multiple + self.fill = fill + self.padding_mode = padding_mode + + 
def __call__(self, image): + return pad_to_multiple(image, multiple=self.multiple, fill=self.fill, + padding_mode=self.padding_mode) diff --git a/sfi/utils.py b/sfi/utils.py new file mode 100644 index 0000000..ad655be --- /dev/null +++ b/sfi/utils.py @@ -0,0 +1,20 @@ +import itertools + + +def batched(iterable, n): + counter = itertools.count() + + for _, group in itertools.groupby(iterable, lambda _: next(counter) // n): + yield list(group) + + +def files(path): + return sorted([p for p in path.iterdir() if p.is_file()]) + + +# Proper weight decay for Adam, not L2 penalty +# https://github.com/pytorch/pytorch/pull/4429 +def decay_weights(optimizer, v): + for group in optimizer.param_groups: + for param in group["params"]: + param.data.add_(-v * group["lr"])
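A quick usage note on the `batched` helper in `sfi/utils.py` above, which the frame and video tools use to walk an iterable in fixed-size chunks; the values below are illustrative only.

```python
# batched() groups consecutive items of any iterable into lists of size n;
# the last group may be shorter. Illustrative values only.
from sfi.utils import batched

print(list(batched(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]
```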