From 1c5c3fc763fb20d04fae756e5a946e75bb5fc1a6 Mon Sep 17 00:00:00 2001 From: daniel-j-h Date: Mon, 24 Feb 2020 13:51:46 +0100 Subject: [PATCH] Initial commit --- .dockerignore | 2 + .flake8 | 2 + .gitignore | 5 + Dockerfile.cpu | 67 +++++++++++++ Dockerfile.gpu | 68 +++++++++++++ LICENSE.md | 21 ++++ Makefile | 32 +++++++ README.md | 104 ++++++++++++++++++++ bin/sfi | 3 + requirements.in | 7 ++ requirements.txt | 145 ++++++++++++++++++++++++++++ scripts/drawlines.py | 32 +++++++ scripts/key-frames-to-video | 17 ++++ scripts/scale-crop-image | 14 +++ scripts/split-image | 14 +++ scripts/video-to-key-frames | 12 +++ scripts/video-to-resampled-frames | 12 +++ sfi/__init__.py | 4 + sfi/datasets.py | 50 ++++++++++ sfi/features.py | 74 +++++++++++++++ sfi/index.py | 52 ++++++++++ sfi/io.py | 36 +++++++ sfi/mixup.py | 59 ++++++++++++ sfi/tools/__init__.py | 0 sfi/tools/__main__.py | 89 +++++++++++++++++ sfi/tools/client.py | 33 +++++++ sfi/tools/export.py | 28 ++++++ sfi/tools/feature.py | 32 +++++++ sfi/tools/feature3d.py | 66 +++++++++++++ sfi/tools/frames.py | 67 +++++++++++++ sfi/tools/infer.py | 70 ++++++++++++++ sfi/tools/server.py | 58 +++++++++++ sfi/tools/stream.py | 92 ++++++++++++++++++ sfi/tools/train.py | 153 ++++++++++++++++++++++++++++++ sfi/transforms.py | 44 +++++++++ sfi/utils.py | 20 ++++ 36 files changed, 1584 insertions(+) create mode 100644 .dockerignore create mode 100644 .flake8 create mode 100644 .gitignore create mode 100644 Dockerfile.cpu create mode 100644 Dockerfile.gpu create mode 100644 LICENSE.md create mode 100644 Makefile create mode 100644 README.md create mode 100755 bin/sfi create mode 100644 requirements.in create mode 100644 requirements.txt create mode 100755 scripts/drawlines.py create mode 100755 scripts/key-frames-to-video create mode 100755 scripts/scale-crop-image create mode 100755 scripts/split-image create mode 100755 scripts/video-to-key-frames create mode 100755 scripts/video-to-resampled-frames create mode 100644 sfi/__init__.py create mode 100644 sfi/datasets.py create mode 100644 sfi/features.py create mode 100644 sfi/index.py create mode 100644 sfi/io.py create mode 100644 sfi/mixup.py create mode 100644 sfi/tools/__init__.py create mode 100644 sfi/tools/__main__.py create mode 100644 sfi/tools/client.py create mode 100644 sfi/tools/export.py create mode 100644 sfi/tools/feature.py create mode 100644 sfi/tools/feature3d.py create mode 100644 sfi/tools/frames.py create mode 100644 sfi/tools/infer.py create mode 100644 sfi/tools/server.py create mode 100644 sfi/tools/stream.py create mode 100644 sfi/tools/train.py create mode 100644 sfi/transforms.py create mode 100644 sfi/utils.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..3978a0f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.git +.gitignore diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..6deafc2 --- /dev/null +++ b/.flake8 @@ -0,0 +1,2 @@ +[flake8] +max-line-length = 120 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f3c231a --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +*.py[cod] + +*.pth +*.pb diff --git a/Dockerfile.cpu b/Dockerfile.cpu new file mode 100644 index 0000000..8d54486 --- /dev/null +++ b/Dockerfile.cpu @@ -0,0 +1,67 @@ +FROM ubuntu:18.04 + +WORKDIR /usr/src/app + +ENV LANG="C.UTF-8" LC_ALL="C.UTF-8" PATH="/opt/venv/bin:$PATH" PIP_NO_CACHE_DIR="false" CFLAGS="-mavx2" CXXFLAGS="-mavx2" + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y 
--no-install-recommends \ + python3 python3-pip python3-venv \ + wget make g++ ffmpeg python3-dev libblas-dev liblapack-dev swig \ + cmake yasm zlib1g-dev && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . + +RUN python3 -m venv /opt/venv && \ + python3 -m pip install pip==19.2.3 pip-tools==4.0.0 + +# For pytorch and torchvision we need platform specific (cpu vs. gpu) wheels from +# https://download.pytorch.org/whl/cpu/torch_stable.html +# To generate hashes run: python3 -m pip hash *.whl +RUN echo "https://download.pytorch.org/whl/cpu/torch-1.2.0%2Bcpu-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:7b9b943673d3acb446248ba0d6feed6926bf60ce719ace4707a6559c1f57ced7 \ + \n \ + https://download.pytorch.org/whl/cpu/torchvision-0.4.0%2Bcpu-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:63f342b858b18839fcf3ff8ad857e44a4ff0fcb8cb8e2bdc2f4ed9afa7cec9e0 \ + \n" >> requirements.txt && cat requirements.txt + +RUN python3 -m piptools sync + +RUN python3 -c "from torchvision.models import resnet50; resnet50(pretrained=True, progress=False)" && \ + python3 -c "from torchvision.models.video import r2plus1d_18; r2plus1d_18(pretrained=True, progress=False)" + +RUN wget -q https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.3.tar.gz -O libjpeg-turbo.tar.gz && \ + echo "a69598bf079463b34d45ca7268462a18b6507fdaa62bb1dfd212f02041499b5d libjpeg-turbo.tar.gz" | sha256sum -c && \ + tar xf libjpeg-turbo.tar.gz && \ + rm libjpeg-turbo.tar.gz && \ + cd libjpeg-turbo* && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DREQUIRE_SIMD=On -DCMAKE_INSTALL_PREFIX=/usr/local .. && \ + make -j $(nproc) && \ + make install && \ + ldconfig && \ + cd ../../ && \ + rm -rf libjpeg-turbo* + +RUN python3 -m pip uninstall -y pillow && \ + python3 -m pip install --no-binary :all: --compile pillow-simd==6.0.0.post0 + +RUN wget -q https://github.com/facebookresearch/faiss/archive/v1.5.3.tar.gz -O faiss.tar.gz && \ + echo "b24d347b0285d01c2ed663ccc7596cd0ea95071f3dd5ebb573ccfc28f15f043b faiss.tar.gz" | sha256sum -c && \ + tar xf faiss.tar.gz && \ + rm faiss.tar.gz && \ + cd faiss* && \ + ./configure --without-cuda && \ + make -j $(nproc) && \ + make -j $(nproc) -C python && \ + make install && \ + make -C python install && \ + cd .. && \ + rm -rf faiss* + +COPY . . + +EXPOSE 5000 +ENTRYPOINT ["/usr/src/app/bin/sfi"] +CMD ["-h"] diff --git a/Dockerfile.gpu b/Dockerfile.gpu new file mode 100644 index 0000000..1dba111 --- /dev/null +++ b/Dockerfile.gpu @@ -0,0 +1,68 @@ +FROM nvidia/cuda:10.1-cudnn7-devel + +WORKDIR /usr/src/app + +ENV LANG="C.UTF-8" LC_ALL="C.UTF-8" PATH="/opt/venv/bin:$PATH" PIP_NO_CACHE_DIR="false" CFLAGS="-mavx2" CXXFLAGS="-mavx2" + +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + python3 python3-pip python3-venv \ + wget make g++ ffmpeg python3-dev libblas-dev liblapack-dev swig \ + cmake yasm zlib1g-dev && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . + +RUN python3 -m venv /opt/venv && \ + python3 -m pip install pip==19.2.3 pip-tools==4.0.0 + +# For pytorch and torchvision we need platform specific (cpu vs. 
gpu) wheels from +# https://download.pytorch.org/whl/cu100/torch_stable.html +# To generate hashes run: python3 -m pip hash *.whl +RUN echo "https://download.pytorch.org/whl/cu100/torch-1.2.0-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:a13bf6f78a49d844b85c142b8cd62d2e1833a11ed21ea0bc6b1ac73d24c76415 \ + \n \ + https://download.pytorch.org/whl/cu100/torchvision-0.4.0-cp36-cp36m-manylinux1_x86_64.whl \ + --hash=sha256:2f67efdf6edd9ea7f9cd9a3917ae5c63d5684e3bdb5cc9c2b364c15bdfe4456b \ + \n" >> requirements.txt + +RUN python3 -m piptools sync + +RUN python3 -c "from torchvision.models import resnet50; resnet50(pretrained=True, progress=False)" && \ + python3 -c "from torchvision.models.video import r2plus1d_18; r2plus1d_18(pretrained=True, progress=False)" + +RUN wget -q https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.3.tar.gz -O libjpeg-turbo.tar.gz && \ + echo "a69598bf079463b34d45ca7268462a18b6507fdaa62bb1dfd212f02041499b5d libjpeg-turbo.tar.gz" | sha256sum -c && \ + tar xf libjpeg-turbo.tar.gz && \ + rm libjpeg-turbo.tar.gz && \ + cd libjpeg-turbo* && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DREQUIRE_SIMD=On -DCMAKE_INSTALL_PREFIX=/usr/local .. && \ + make -j $(nproc) && \ + make install && \ + ldconfig && \ + cd ../../ && \ + rm -rf libjpeg-turbo* + + +RUN python3 -m pip uninstall -y pillow && \ + python3 -m pip install --no-binary :all: --compile pillow-simd==6.0.0.post0 + +RUN wget -q https://github.com/facebookresearch/faiss/archive/v1.5.3.tar.gz -O faiss.tar.gz && \ + echo "b24d347b0285d01c2ed663ccc7596cd0ea95071f3dd5ebb573ccfc28f15f043b faiss.tar.gz" | sha256sum -c && \ + tar xf faiss.tar.gz && \ + rm faiss.tar.gz && \ + cd faiss* && \ + ./configure --with-cuda-arch="-gencode=arch=compute_37,code=compute_37 -gencode=arch=compute_70,code=compute_70" --with-cuda="/usr/local/cuda" && \ + make -j $(nproc) && \ + make -j $(nproc) -C python && \ + make install && \ + make -C python install && \ + cd .. && \ + rm -rf faiss* + +COPY . . + +EXPOSE 5000 +ENTRYPOINT ["/usr/src/app/bin/sfi"] +CMD ["-h"] diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..2620140 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 MoabitCoin + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
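Both Dockerfiles compile libjpeg-turbo, pillow-simd and faiss from source, pin the platform-specific torch/torchvision wheels by hash, and warm the torch hub cache by instantiating the pre-trained models at build time. A minimal smoke-test sketch (not part of the image; the file name and invocation are hypothetical) to confirm the compiled dependencies import and construct correctly inside a built container:

    # smoke.py (hypothetical): run inside the container, e.g. after `make r`, with python3 smoke.py
    import faiss
    import torch
    import torchvision
    import PIL
    from torchvision.models import resnet50

    # The source-built faiss links against BLAS/LAPACK and can construct indexes.
    print("faiss index dim:", faiss.IndexFlatL2(2048).d)

    # CUDA availability distinguishes the Dockerfile.cpu and Dockerfile.gpu images.
    print("torch", torch.__version__, "cuda:", torch.cuda.is_available())
    print("torchvision", torchvision.__version__, "pillow", PIL.__version__)

    # Weights come from the torch hub cache primed during the image build.
    resnet50(pretrained=True, progress=False)
    print("resnet50 weights loaded from the baked-in cache")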
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..4d72bfc --- /dev/null +++ b/Makefile @@ -0,0 +1,32 @@ +dockerimage ?= moabitcoin/sfi +dockerfile ?= Dockerfile.cpu +srcdir ?= $(shell pwd) +datadir ?= $(shell pwd) + +install: + @docker build -t $(dockerimage) -f $(dockerfile) . + +i: install + + +update: + @docker build -t $(dockerimage) -f $(dockerfile) . --pull --no-cache + +u: update + + +run: + @docker run -it --rm --ipc="host" --network="host" -p 5000:5000 -v $(srcdir)/sfi:/usr/src/app/sfi -v $(datadir):/data --entrypoint=/bin/bash $(dockerimage) + +r: run + + +publish: + @docker image save $(dockerimage) \ + | pv -N "Publish $(dockerimage) to $(sshopts)" -s $(shell docker image inspect $(dockerimage) --format "{{.Size}}") \ + | ssh $(sshopts) "docker image load" + +p: publish + + +.PHONY: install i run r update u publish p diff --git a/README.md b/README.md new file mode 100644 index 0000000..aa4012b --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +# Semantic Frame Index + +Fast and efficient queries on video frames by semantic similarity. + + +## Use Case + +We record tens of thousands of hours of drive video data and need to be able to search for semantically similar scenarios. +Similarity could mean similar lighting conditions, similar vehicle types, similar traffic volumes, similar objects on the road, and so on. + + +## Implementation Sketch + +We +- extract key frames using a neural net for frame similarity in feature space +- extract a trained convolutional neural net's high-level feature maps for all key frames +- compute Maximum Activations of Convolution (MAC) features from the high-level feature maps +- index the MAC features for approximate nearest neighbor searches based on L2 distance +- query the indexed dataset for semantically similar scenarios + + +## Usage + +All tools can be invoked via + + ./bin/sfi <tool> + + ./bin/sfi --help + ./bin/sfi <tool> --help + + +### stream-index + +Builds an index from a directory of images for fast and efficient approximate nearest neighbor queries based on L2 distance. +The quantizer for the index needs to get trained on a small subset of the feature maps to approximate the dataset's centroids. +We recommend running this step on GPUs. + + +### save-feature + +Extracts high-level feature maps and computes MACs for image frames from a trained convolutional neural net. + + +### save-frames + +Extracts semantic key frames from videos based on a trained convolutional net for feature similarity between frames. + + +### query-server + +Loads up the index (slow) and keeps it in memory to handle nearest neighbor queries (fast). +Responds to queries by searching the index, aggregating results, and re-ranking them. + + +### query-client + +Sends nearest neighbor requests against the query server and reports results to the user. +The query and results are based on the saved MAC features. + + +### model-train + +Trains a binary classification model on a dataset (potentially noisy and obtained from the index). +We recommend running this step on GPUs. + + +### model-infer + +Predicts binary classification labels on a dataset, using a trained model. + + +## Development + +Create a self-contained reproducible development environment + + make i + +Get into the development environment + + make r + +The Python source code directory is mounted into the container: if you modify it on the host it will get modified in the container. + +To make data visible in the container set the datadir env var, e.g.
to make your `/tmp` directory show up in `/data` inside the container run + + make r datadir=/tmp + +See the `Makefile` for options and more advanced targets. + + +## References + +- [Particular object retrieval with integral max-pooling of CNN activations](https://arxiv.org/abs/1511.05879) +- Product Quantizer (PQ) [part 1](http://mccormickml.com/2017/10/13/product-quantizer-tutorial-part-1/), and [part 2](http://mccormickml.com/2017/10/22/product-quantizer-tutorial-part-2/) +- [Product Quantization for Nearest Neighbor Search](https://hal.inria.fr/file/index/docid/514462/filename/paper_hal.pdf) +- [Billion-scale similarity search with GPUs](https://arxiv.org/pdf/1702.08734.pdf) +- [faiss wiki](https://github.com/facebookresearch/faiss/wiki) + + +## License + +Copyright © 2019 MoabitCoin + +Distributed under the MIT License (MIT). diff --git a/bin/sfi b/bin/sfi new file mode 100755 index 0000000..9f648f8 --- /dev/null +++ b/bin/sfi @@ -0,0 +1,3 @@ +#!/bin/bash + +python3 -m sfi.tools "$@" diff --git a/requirements.in b/requirements.in new file mode 100644 index 0000000..320a6af --- /dev/null +++ b/requirements.in @@ -0,0 +1,7 @@ +numpy +pillow +tqdm +flask +requests +einops +scikit-video diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..49c0fd5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,145 @@ +# +# This file is autogenerated by pip-compile +# To update, run: +# +# pip-compile --generate-hashes +# +certifi==2019.6.16 \ + --hash=sha256:046832c04d4e752f37383b628bc601a7ea7211496b4638f6514d0e5b9acc4939 \ + --hash=sha256:945e3ba63a0b9f577b1395204e13c3a231f9bc0223888be653286534e5873695 \ + # via requests +chardet==3.0.4 \ + --hash=sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae \ + --hash=sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691 \ + # via requests +click==7.0 \ + --hash=sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13 \ + --hash=sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7 \ + # via flask +einops==0.1.0 \ + --hash=sha256:4ab512fe059c0841e1a315449ca9d7f35eaa05c8c095a14f2c1b92b2b77684d2 \ + --hash=sha256:4fd64864fcb8159074da3213b9327c242536784416cbf423745ef8579850d30b +flask==1.1.1 \ + --hash=sha256:13f9f196f330c7c2c5d7a5cf91af894110ca0215ac051b5844701f2bfd934d52 \ + --hash=sha256:45eb5a6fd193d6cf7e0cf5d8a5b31f83d5faae0293695626f539a823e93b13f6 +idna==2.8 \ + --hash=sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407 \ + --hash=sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c \ + # via requests +itsdangerous==1.1.0 \ + --hash=sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19 \ + --hash=sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749 \ + # via flask +jinja2==2.10.1 \ + --hash=sha256:065c4f02ebe7f7cf559e49ee5a95fb800a9e4528727aec6f24402a5374c65013 \ + --hash=sha256:14dd6caf1527abb21f08f86c784eac40853ba93edb79552aa1e4b8aef1b61c7b \ + # via flask +markupsafe==1.1.1 \ + --hash=sha256:00bc623926325b26bb9605ae9eae8a215691f33cae5df11ca5424f06f2d1f473 \ + --hash=sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161 \ + --hash=sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235 \ + --hash=sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5 \ + --hash=sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff \ + --hash=sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b \ + 
--hash=sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1 \ + --hash=sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e \ + --hash=sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183 \ + --hash=sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66 \ + --hash=sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1 \ + --hash=sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1 \ + --hash=sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e \ + --hash=sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b \ + --hash=sha256:7c1699dfe0cf8ff607dbdcc1e9b9af1755371f92a68f706051cc8c37d447c905 \ + --hash=sha256:88e5fcfb52ee7b911e8bb6d6aa2fd21fbecc674eadd44118a9cc3863f938e735 \ + --hash=sha256:8defac2f2ccd6805ebf65f5eeb132adcf2ab57aa11fdf4c0dd5169a004710e7d \ + --hash=sha256:98c7086708b163d425c67c7a91bad6e466bb99d797aa64f965e9d25c12111a5e \ + --hash=sha256:9add70b36c5666a2ed02b43b335fe19002ee5235efd4b8a89bfcf9005bebac0d \ + --hash=sha256:9bf40443012702a1d2070043cb6291650a0841ece432556f784f004937f0f32c \ + --hash=sha256:ade5e387d2ad0d7ebf59146cc00c8044acbd863725f887353a10df825fc8ae21 \ + --hash=sha256:b00c1de48212e4cc9603895652c5c410df699856a2853135b3967591e4beebc2 \ + --hash=sha256:b1282f8c00509d99fef04d8ba936b156d419be841854fe901d8ae224c59f0be5 \ + --hash=sha256:b2051432115498d3562c084a49bba65d97cf251f5a331c64a12ee7e04dacc51b \ + --hash=sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6 \ + --hash=sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f \ + --hash=sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f \ + --hash=sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7 \ + # via jinja2 +numpy==1.17.0 \ + --hash=sha256:03e311b0a4c9f5755da7d52161280c6a78406c7be5c5cc7facfbcebb641efb7e \ + --hash=sha256:0cdd229a53d2720d21175012ab0599665f8c9588b3b8ffa6095dd7b90f0691dd \ + --hash=sha256:312bb18e95218bedc3563f26fcc9c1c6bfaaf9d453d15942c0839acdd7e4c473 \ + --hash=sha256:464b1c48baf49e8505b1bb754c47a013d2c305c5b14269b5c85ea0625b6a988a \ + --hash=sha256:5adfde7bd3ee4864536e230bcab1c673f866736698724d5d28c11a4d63672658 \ + --hash=sha256:7724e9e31ee72389d522b88c0d4201f24edc34277999701ccd4a5392e7d8af61 \ + --hash=sha256:8d36f7c53ae741e23f54793ffefb2912340b800476eb0a831c6eb602e204c5c4 \ + --hash=sha256:910d2272403c2ea8a52d9159827dc9f7c27fb4b263749dca884e2e4a8af3b302 \ + --hash=sha256:951fefe2fb73f84c620bec4e001e80a80ddaa1b84dce244ded7f1e0cbe0ed34a \ + --hash=sha256:9588c6b4157f493edeb9378788dcd02cb9e6a6aeaa518b511a1c79d06cbd8094 \ + --hash=sha256:9ce8300950f2f1d29d0e49c28ebfff0d2f1e2a7444830fbb0b913c7c08f31511 \ + --hash=sha256:be39cca66cc6806652da97103605c7b65ee4442c638f04ff064a7efd9a81d50a \ + --hash=sha256:c3ab2d835b95ccb59d11dfcd56eb0480daea57cdf95d686d22eff35584bc4554 \ + --hash=sha256:eb0fc4a492cb896346c9e2c7a22eae3e766d407df3eb20f4ce027f23f76e4c54 \ + --hash=sha256:ec0c56eae6cee6299f41e780a0280318a93db519bbb2906103c43f3e2be1206c \ + --hash=sha256:f4e4612de60a4f1c4d06c8c2857cdcb2b8b5289189a12053f37d3f41f06c60d0 +pillow==6.1.0 \ + --hash=sha256:0804f77cb1e9b6dbd37601cee11283bba39a8d44b9ddb053400c58e0c0d7d9de \ + --hash=sha256:0ab7c5b5d04691bcbd570658667dd1e21ca311c62dcfd315ad2255b1cd37f64f \ + --hash=sha256:0b3e6cf3ea1f8cecd625f1420b931c83ce74f00c29a0ff1ce4385f99900ac7c4 \ + --hash=sha256:365c06a45712cd723ec16fa4ceb32ce46ad201eb7bbf6d3c16b063c72b61a3ed \ + 
--hash=sha256:38301fbc0af865baa4752ddae1bb3cbb24b3d8f221bf2850aad96b243306fa03 \ + --hash=sha256:3aef1af1a91798536bbab35d70d35750bd2884f0832c88aeb2499aa2d1ed4992 \ + --hash=sha256:3fe0ab49537d9330c9bba7f16a5f8b02da615b5c809cdf7124f356a0f182eccd \ + --hash=sha256:45a619d5c1915957449264c81c008934452e3fd3604e36809212300b2a4dab68 \ + --hash=sha256:49f90f147883a0c3778fd29d3eb169d56416f25758d0f66775db9184debc8010 \ + --hash=sha256:571b5a758baf1cb6a04233fb23d6cf1ca60b31f9f641b1700bfaab1194020555 \ + --hash=sha256:5ac381e8b1259925287ccc5a87d9cf6322a2dc88ae28a97fe3e196385288413f \ + --hash=sha256:6153db744a743c0c8c91b8e3b9d40e0b13a5d31dbf8a12748c6d9bfd3ddc01ad \ + --hash=sha256:6fd63afd14a16f5d6b408f623cc2142917a1f92855f0df997e09a49f0341be8a \ + --hash=sha256:70acbcaba2a638923c2d337e0edea210505708d7859b87c2bd81e8f9902ae826 \ + --hash=sha256:70b1594d56ed32d56ed21a7fbb2a5c6fd7446cdb7b21e749c9791eac3a64d9e4 \ + --hash=sha256:76638865c83b1bb33bcac2a61ce4d13c17dba2204969dedb9ab60ef62bede686 \ + --hash=sha256:7b2ec162c87fc496aa568258ac88631a2ce0acfe681a9af40842fc55deaedc99 \ + --hash=sha256:7cee2cef07c8d76894ebefc54e4bb707dfc7f258ad155bd61d87f6cd487a70ff \ + --hash=sha256:7d16d4498f8b374fc625c4037742fbdd7f9ac383fd50b06f4df00c81ef60e829 \ + --hash=sha256:b50bc1780681b127e28f0075dfb81d6135c3a293e0c1d0211133c75e2179b6c0 \ + --hash=sha256:bd0582f831ad5bcad6ca001deba4568573a4675437db17c4031939156ff339fa \ + --hash=sha256:cfd40d8a4b59f7567620410f966bb1f32dc555b2b19f82a91b147fac296f645c \ + --hash=sha256:e3ae410089de680e8f84c68b755b42bc42c0ceb8c03dbea88a5099747091d38e \ + --hash=sha256:e9046e559c299b395b39ac7dbf16005308821c2f24a63cae2ab173bd6aa11616 \ + --hash=sha256:ef6be704ae2bc8ad0ebc5cb850ee9139493b0fc4e81abcc240fb392a63ebc808 \ + --hash=sha256:f8dc19d92896558f9c4317ee365729ead9d7bbcf2052a9a19a3ef17abbb8ac5b +requests==2.22.0 \ + --hash=sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4 \ + --hash=sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31 +scikit-video==1.1.11 \ + --hash=sha256:4fc131e509aaeeb0eecb6acb58b92a7ef905be5dbe27ed1d1ae089634b601f23 \ + --hash=sha256:5061d2aeae1892b73a97c89a82942b3e8eebf2fe23e59c60e06ede5f8a24ed1e +scipy==1.3.1 \ + --hash=sha256:0baa64bf42592032f6f6445a07144e355ca876b177f47ad8d0612901c9375bef \ + --hash=sha256:243b04730d7223d2b844bda9500310eecc9eda0cba9ceaf0cde1839f8287dfa8 \ + --hash=sha256:2643cfb46d97b7797d1dbdb6f3c23fe3402904e3c90e6facfe6a9b98d808c1b5 \ + --hash=sha256:396eb4cdad421f846a1498299474f0a3752921229388f91f60dc3eda55a00488 \ + --hash=sha256:3ae3692616975d3c10aca6d574d6b4ff95568768d4525f76222fb60f142075b9 \ + --hash=sha256:435d19f80b4dcf67dc090cc04fde2c5c8a70b3372e64f6a9c58c5b806abfa5a8 \ + --hash=sha256:46a5e55850cfe02332998b3aef481d33f1efee1960fe6cfee0202c7dd6fc21ab \ + --hash=sha256:75b513c462e58eeca82b22fc00f0d1875a37b12913eee9d979233349fce5c8b2 \ + --hash=sha256:7ccfa44a08226825126c4ef0027aa46a38c928a10f0a8a8483c80dd9f9a0ad44 \ + --hash=sha256:89dd6a6d329e3f693d1204d5562dd63af0fd7a17854ced17f9cbc37d5b853c8d \ + --hash=sha256:a81da2fe32f4eab8b60d56ad43e44d93d392da228a77e229e59b51508a00299c \ + --hash=sha256:a9d606d11eb2eec7ef893eb825017fbb6eef1e1d0b98a5b7fc11446ebeb2b9b1 \ + --hash=sha256:ac37eb652248e2d7cbbfd89619dce5ecfd27d657e714ed049d82f19b162e8d45 \ + --hash=sha256:cbc0611699e420774e945f6a4e2830f7ca2b3ee3483fca1aa659100049487dd5 \ + --hash=sha256:d02d813ec9958ed63b390ded463163685af6025cb2e9a226ec2c477df90c6957 \ + --hash=sha256:dd3b52e00f93fd1c86f2d78243dfb0d02743c94dd1d34ffea10055438e63b99d \ + # via 
scikit-video +tqdm==4.33.0 \ + --hash=sha256:1dc82f87a8726602fa7177a091b5e8691d6523138a8f7acd08e58088f51e389f \ + --hash=sha256:47220a4f2aeebbc74b0ab317584264ea44c745e1fd5ff316b675cd0aff8afad8 +urllib3==1.25.3 \ + --hash=sha256:b246607a25ac80bedac05c6f282e3cdaf3afb65420fd024ac94435cabe6e18d1 \ + --hash=sha256:dbe59173209418ae49d485b87d1681aefa36252ee85884c31346debd19463232 \ + # via requests +werkzeug==0.15.5 \ + --hash=sha256:87ae4e5b5366da2347eb3116c0e6c681a0e939a33b2805e2c0cbd282664932c4 \ + --hash=sha256:a13b74dd3c45f758d4ebdb224be8f1ab8ef58b3c0ffc1783a8c7d9f4f50227e6 \ + # via flask diff --git a/scripts/drawlines.py b/scripts/drawlines.py new file mode 100755 index 0000000..2febc55 --- /dev/null +++ b/scripts/drawlines.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 + +import sys + +from PIL import Image +from PIL import ImageDraw + + +if len(sys.argv) != 3: + sys.exit("Usage: {} in.jpg out.jpg".format(sys.argv[0])) + + +infile = sys.argv[1] +outfile = sys.argv[2] + +image = Image.open(infile) +w, h = image.size + +draw = ImageDraw.Draw(image) + +n = 7 + +dx = w // n +dy = h // n + +for i in range(1, n): + draw.line([i * dx, 0, i * dx, h], fill="green", width=1) + +for j in range(1, n): + draw.line([0, j * dy, w, j * dy], fill="green", width=1) + +image.save(outfile, optimize=True) diff --git a/scripts/key-frames-to-video b/scripts/key-frames-to-video new file mode 100755 index 0000000..ac39725 --- /dev/null +++ b/scripts/key-frames-to-video @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -lt 2 ]; then + echo "Usage: $(basename $0) framedir video.mp4 [fps] [resolution]" + exit 1 +fi + +readonly framedir="${1}" +readonly video="${2}" +readonly rate="${3:-2}" +readonly resolution="${4:-320x180}" + +ffmpeg -y -loglevel error -r "${rate}" -vsync 0 -f image2 -pattern_type glob -i "${framedir}/*.jpg" -s "${resolution}" -vcodec libx264 "${video}" diff --git a/scripts/scale-crop-image b/scripts/scale-crop-image new file mode 100755 index 0000000..3a8a5f2 --- /dev/null +++ b/scripts/scale-crop-image @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -lt 2 ]; then + echo "Usage: $(basename $0) in.jpg out.jpg [WxH]" + exit 1 +fi + +readonly resolution="${3:-224x224}" + +convert "${1}" -resize "${resolution}^" -gravity Center -extent "${resolution}" "${2}" diff --git a/scripts/split-image b/scripts/split-image new file mode 100755 index 0000000..7756622 --- /dev/null +++ b/scripts/split-image @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -lt 2 ]; then + echo "Usage: $(basename $0) in.jpg outdir [WxH]" + exit 1 +fi + +readonly resolution="${3:-32x32}" + +convert "${1}" -crop "${resolution}" +repage +adjoin "${2}/split-%d.jpg" diff --git a/scripts/video-to-key-frames b/scripts/video-to-key-frames new file mode 100755 index 0000000..0b369ff --- /dev/null +++ b/scripts/video-to-key-frames @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -ne 2 ]; then + echo "Usage: $(basename $0) video.mp4 outdir" + exit 1 +fi + +ffmpeg -y -loglevel error -skip_frame nokey -i "${1}" -vsync 0 -f image2 "${2}/frame-%d.jpg" diff --git a/scripts/video-to-resampled-frames b/scripts/video-to-resampled-frames new file mode 100755 index 0000000..887d8c9 --- /dev/null +++ b/scripts/video-to-resampled-frames @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +set -o errexit +set -o pipefail +set -o nounset + +if [ $# -ne 3 ]; then + echo 
"Usage: $(basename $0) video.mp4 outdir fps" + exit 1 +fi + +ffmpeg -y -loglevel error -r "${3}" -i "${1}" -vsync 0 -f image2 "${2}/frame-%d.jpg" diff --git a/sfi/__init__.py b/sfi/__init__.py new file mode 100644 index 0000000..37769da --- /dev/null +++ b/sfi/__init__.py @@ -0,0 +1,4 @@ +"""Semantic Frame Index + + Fast and efficient queries on video frames by semantic similarity. +""" diff --git a/sfi/datasets.py b/sfi/datasets.py new file mode 100644 index 0000000..372dfad --- /dev/null +++ b/sfi/datasets.py @@ -0,0 +1,50 @@ +from PIL import Image + +from torch.utils.data import Dataset + +from sfi.utils import files + +# PyTorch can not transport a Path object through data loaders. +# Serialize Path to str here; users have to encode via Path(path). + + +class ImageDirectory(Dataset): + def __init__(self, root, transform=None): + super().__init__() + + self.paths = files(root) + self.transform = transform + + def __len__(self): + return len(self.paths) + + def __getitem__(self, i): + path = str(self.paths[i]) + image = Image.open(path) + + if self.transform is not None: + image = self.transform(image) + + return image, path + + +class ImageSingleton(Dataset): + def __init__(self, root, transform=None): + super().__init__() + + self.path = root + self.transform = transform + + def __len__(self): + return 1 + + def __getitem__(self, i): + assert i == 0 + + path = str(self.path) + image = Image.open(path) + + if self.transform is not None: + image = self.transform(image) + + return image, path diff --git a/sfi/features.py b/sfi/features.py new file mode 100644 index 0000000..d414310 --- /dev/null +++ b/sfi/features.py @@ -0,0 +1,74 @@ +import sys + +import torch +import torch.nn as nn +from torchvision.models import resnet50 +from torchvision.transforms import Compose, Normalize, ToTensor, Resize + +from einops import rearrange + +from sfi.transforms import ToImageMode, PadToMultiple + + +class FeatureExtractor: + def __init__(self, image_size): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # Set up pre-trained resnet in inference mode + resnet = resnet50(pretrained=True, progress=False) + + # Chop off classification head + resnet.fc = nn.Identity() + + # In addition do not pool, keep spatial information if user wants to + resnet.avgpool = nn.Identity() + + for params in resnet.parameters(): + params.requires_grad = False + + resnet = resnet.to(device) + resnet = nn.DataParallel(resnet) + + resnet.eval() + + self.net = resnet + self.device = device + self.image_size = image_size + + @property + def transform(self): + # ImageNet statistics (because we use pre-trained model) + mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] + + return Compose([ + ToImageMode("RGB"), + Resize(self.image_size), + # resnet5 downsamples x2 five times + PadToMultiple(32, fill=0), + ToTensor(), + Normalize(mean=mean, std=std)]) + + # batch of NCHW image tensors to batch of NHWC feature tensors + def __call__(self, images): + n, c, h, w = images.size(0), 2048, images.size(2), images.size(3) + + assert h % 32 == 0, "height divisible by 32 for resnet50" + assert w % 32 == 0, "width divisible by 32 for resnet50" + + with torch.no_grad(): + images = images.to(self.device) + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # resnet50 outputs flat view over a batch with 2048 channels, spatial resolution HxW + # 
https://github.com/pytorch/vision/blob/ac2e995a4352267f65e7cc6d354bde683a4fb402/torchvision/models/resnet.py#L202-L204 + + features = self.net(images) + features = rearrange(features, "n (c h w) -> n (h w) c", n=n, h=h, w=w, c=c) + + return features diff --git a/sfi/index.py b/sfi/index.py new file mode 100644 index 0000000..5d6424c --- /dev/null +++ b/sfi/index.py @@ -0,0 +1,52 @@ +import numpy as np +from einops import rearrange + +from faiss import IndexPQ + +from sfi.io import IndexIO, JsonIO + + +# TODO: benchmark +kNumResultsPerIndex = 512 + + +class IndexQueryError(Exception): + pass + + +class Index: + def __init__(self, path, metadata, features_size, num_probes=1): + self.index = IndexIO.load(path) + self.index.nprobes = num_probes + + # Disable Polysemous Codes until we know threshold for MACs + # self.index.search_type = IndexPQ.ST_polysemous + # self.index.polysemous_ht = 768 + + self.metadata = JsonIO.load(metadata) + self.features_size = features_size + + def query(self, query, num_results=1): + N, C = query.shape + + if N != self.features_size * self.features_size: + raise IndexQueryError("query feature size does not match index feature size") + + # C-array required for faiss FFI: tensors might not be contiguous + query = np.ascontiguousarray(query) + + dists, idxs = self.index.search(query, kNumResultsPerIndex) + + dists = rearrange(dists, "() n -> n") + idxs = rearrange(idxs, "() n -> n") + + results = list(zip(dists, idxs)) + + _, uniqued = np.unique([i for _, i in results], return_index=True) + results = [results[i] for i in uniqued] + results = sorted(results, key=lambda v: v[0]) + + results = [(round(d.item(), 3), self.metadata[i]) + for d, i in results[:num_results]] + + return results diff --git a/sfi/io.py b/sfi/io.py new file mode 100644 index 0000000..46d236c --- /dev/null +++ b/sfi/io.py @@ -0,0 +1,36 @@ +import json + +import numpy as np +import faiss + + +class ArrayIO: + @staticmethod + def save(path, x): + return np.save(str(path), x, allow_pickle=False) + + @staticmethod + def load(path): + return np.load(str(path), allow_pickle=False) + + +class IndexIO: + @staticmethod + def save(path, x): + return faiss.write_index(x, str(path)) + + @staticmethod + def load(path): + return faiss.read_index(str(path)) + + +class JsonIO: + @staticmethod + def save(path, x): + with path.open("w") as fd: + return json.dump(x, fd) + + @staticmethod + def load(path): + with path.open("r") as fd: + return json.load(fd) diff --git a/sfi/mixup.py b/sfi/mixup.py new file mode 100644 index 0000000..a830605 --- /dev/null +++ b/sfi/mixup.py @@ -0,0 +1,59 @@ +import torch +import torch.nn as nn + +import numpy as np + + +# Mixup for data augmentation +# https://arxiv.org/abs/1710.09412 + +class MixupDataLoaderAdaptor: + def __init__(self, dataloader, alpha=0.4): + self.dataloader = dataloader + self.dataiter = None + self.alpha = alpha + + def __len__(self): + return len(self.dataloader) + + def __iter__(self): + self.dataiter = iter(self.dataloader) + return self + + def __next__(self): + inputs1, labels1 = next(self.dataiter) + + n = inputs1.size(0) + + # draw t from (symmetric) beta distribution + # take from one side to prevent duplicates + + t = np.random.beta(self.alpha, self.alpha, size=n) + t = np.concatenate([t[:, None], 1 - t[:, None]], axis=1).max(axis=1) + t = torch.FloatTensor(t) + t = t.view(n, 1, 1, 1) + + # shuffle the batch inputs and targets to get second batch + + r = np.random.permutation(n) + inputs2, labels2 = inputs1[r], labels1[r] + + # mix up the original batch 
with the shuffled batch + + inputs = t * inputs1 + (1 - t) * inputs2 + + # With CrossEntropy we do not need the mixed up labels + # labels = t * labels1.float() + (1 - t) * labels2.float() + + return inputs, t, labels1, labels2 + + +class MixupCrossEntropyLossAdaptor(nn.Module): + def __init__(self, *args, **kwargs): + super().__init__() + self.criterion = nn.CrossEntropyLoss(*args, **kwargs) + + def forward(self, outputs, t, labels1, labels2): + lhs = t * self.criterion(outputs, labels1) + rhs = (1 - t) * self.criterion(outputs, labels2) + return (lhs + rhs).mean() diff --git a/sfi/tools/__init__.py b/sfi/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/sfi/tools/__main__.py b/sfi/tools/__main__.py new file mode 100644 index 0000000..bbf2621 --- /dev/null +++ b/sfi/tools/__main__.py @@ -0,0 +1,89 @@ +from pathlib import Path + +import argparse + +import sfi.tools.frames +import sfi.tools.feature +import sfi.tools.feature3d +import sfi.tools.stream +import sfi.tools.server +import sfi.tools.client +import sfi.tools.train +import sfi.tools.infer +import sfi.tools.export + +parser = argparse.ArgumentParser(prog="sficmd") +subcmd = parser.add_subparsers(title="commands", metavar="") +subcmd.required = True + +Fmt = argparse.ArgumentDefaultsHelpFormatter + +frames = subcmd.add_parser("save-frames", help="saves key frames for video", formatter_class=Fmt) +frames.add_argument("--video", type=Path, required=True, help="file load video from") +frames.add_argument("--frames", type=Path, required=True, help="directory to save key frames to") +frames.add_argument("--similarity", type=float, default=0.95, help="similarity key frame threshold") +frames.add_argument("--pool", choices=["mean", "max"], default="mean", help="spatial pooling mode") +frames.add_argument("--image-size", type=int, default=7 * 32, choices=[v * 32 for v in range(1, 15)]) +frames.add_argument("--batch-size", type=int, default=8) +frames.set_defaults(main=sfi.tools.frames.main) + +stream = subcmd.add_parser("stream-index", help="builds an index in streaming mode", formatter_class=Fmt) +stream.add_argument("--index", type=Path, required=True, help="file to save index to") +stream.add_argument("--frames", type=Path, required=True, help="directory to load image frames from") +stream.add_argument("--num-train", type=int, required=True, help="number of samples to train on") +stream.add_argument("--image-size", type=int, default=14 * 32, choices=[v * 32 for v in range(1, 15)]) +stream.add_argument("--batch-size", type=int, default=64) +stream.add_argument("--num-workers", type=int, default=0) +stream.set_defaults(main=sfi.tools.stream.main) + +feature = subcmd.add_parser("save-feature", help="saves features for frames", formatter_class=Fmt) +feature.add_argument("--frame", type=Path, required=True, help="path to image frame") +feature.add_argument("--feature", type=Path, required=True, help="path to save features to") +feature.add_argument("--image-size", type=int, default=14 * 32, choices=[v * 32 for v in range(1, 15)]) +feature.set_defaults(main=sfi.tools.feature.main) + +feature3d = subcmd.add_parser("save-feature3d", help="saves features for videos", formatter_class=Fmt) +feature3d.add_argument("--video", type=Path, required=True, help="path to video") +feature3d.add_argument("--feature", type=Path, required=True, help="path to save features to") +feature3d.add_argument("--timesteps", type=int, default=64, help="frames per sequence along time axis") +feature3d.set_defaults(main=sfi.tools.feature3d.main) + 
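+
+# Each subcommand registers its entry point via set_defaults(main=...); the
+# args.main(args) call at the bottom of this module dispatches to the selected tool.
+# The query-server below expects the index (.idx) and metadata (.json) files written
+# by stream-index, and its --features-size must match the query features sent to it
+# (1 for the pooled MAC descriptors written by save-feature).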
+server = subcmd.add_parser("query-server", help="starts up the index query http server", formatter_class=Fmt) +server.add_argument("--index", type=Path, required=True, help="file to load index from") +server.add_argument("--host", type=str, default="127.0.0.1") +server.add_argument("--port", type=int, default=5000) +server.add_argument("--num-probes", type=int, default=1, help="number of cells to visit during search") +server.add_argument("--features-size", type=int, default=1, choices=range(1, 15)) +server.set_defaults(main=sfi.tools.server.main) + +client = subcmd.add_parser("query-client", help="queries the query server for similar features", formatter_class=Fmt) +client.add_argument("--host", type=str, default="127.0.0.1") +client.add_argument("--port", type=int, default=5000) +client.add_argument("--query", type=Path, required=True, help="feature file to query the index with") +client.add_argument("--num-results", type=int, default=10, help="number of similar frames to query for") +client.set_defaults(main=sfi.tools.client.main) + +train = subcmd.add_parser("model-train", help="trains a classifier model", formatter_class=Fmt) +train.add_argument("--model", type=Path, required=True, help="file to save trained model to") +train.add_argument("--resume-from", type=Path, help="file to load trained model from") +train.add_argument("--dataset", type=Path, required=True, help="directory to load dataset from") +train.add_argument("--batch-size", type=int, default=24) +train.add_argument("--num-workers", type=int, default=0) +train.add_argument("--num-epochs", type=int, default=100) +train.set_defaults(main=sfi.tools.train.main) + +infer = subcmd.add_parser("model-infer", help="runs inference with a classifier model", formatter_class=Fmt) +infer.add_argument("--model", type=Path, required=True, help="file to load trained model from") +infer.add_argument("--dataset", type=Path, required=True, help="directory to load dataset from") +infer.add_argument("--results", type=Path, required=True, help="file to save results to") +infer.add_argument("--batch-size", type=int, default=64) +infer.add_argument("--num-workers", type=int, default=0) +infer.set_defaults(main=sfi.tools.infer.main) + +export = subcmd.add_parser("model-export", help="export a classifier model to onnx", formatter_class=Fmt) +export.add_argument("--model", type=Path, required=True, help="file to load trained model from") +export.add_argument("--onnx", type=Path, required=True, help="file to save trained onnx model to") +export.set_defaults(main=sfi.tools.export.main) + +args = parser.parse_args() +args.main(args) diff --git a/sfi/tools/client.py b/sfi/tools/client.py new file mode 100644 index 0000000..c224ccf --- /dev/null +++ b/sfi/tools/client.py @@ -0,0 +1,33 @@ +import sys +import json +import base64 + +import requests +from einops import rearrange + +from sfi.io import ArrayIO + + +def main(args): + query = ArrayIO.load(args.query) + + if len(query.shape) == 1: # handle (C,) as (1, C) + query = rearrange(query, "n -> () n") + + N, C = query.shape + dtype = str(query.dtype) + feature = base64.b64encode(query.ravel()).decode("utf-8") + + url = "http://{}:{}".format(args.host, args.port) + + payload = {"num_results": args.num_results, + "feature": feature, + "shape": [N, C], + "dtype": dtype} + + res = requests.post(url, data=json.dumps(payload)) + + if res.status_code != requests.codes.ok: + sys.exit("Error: unable to query server") + + print(json.dumps(res.json())) diff --git a/sfi/tools/export.py b/sfi/tools/export.py new file 
mode 100644 index 0000000..343592f --- /dev/null +++ b/sfi/tools/export.py @@ -0,0 +1,28 @@ +import torch +import torch.onnx +import torch.nn as nn + +from torchvision.models import resnet50 + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # Binary classifier on top of resnet50 + model = resnet50() + model.fc = nn.Linear(model.fc.in_features, 2) + + model = model.to(device) + model = nn.DataParallel(model) + + # Restore trained weights + weights = torch.load(str(args.model), map_location=device) + model.load_state_dict(weights) + + # Run dummy batch through model to trace computational graph + batch = torch.rand(1, 3, 224, 224, device=device) + + torch.onnx.export(model.module, batch, str(args.onnx)) diff --git a/sfi/tools/feature.py b/sfi/tools/feature.py new file mode 100644 index 0000000..8d450cf --- /dev/null +++ b/sfi/tools/feature.py @@ -0,0 +1,32 @@ +from torch.utils.data import DataLoader + +from einops import reduce + +from sfi.datasets import ImageSingleton +from sfi.features import FeatureExtractor +from sfi.io import ArrayIO + + +def main(args): + extract = FeatureExtractor(image_size=args.image_size) + + # We use this tool to compute query features on images of arbitrary sizes. + # That's why we can not batch images and have to feed them one by one. + + dataset = ImageSingleton(root=args.frame, transform=extract.transform) + loader = DataLoader(dataset, batch_size=1, num_workers=0) + + for images, paths in loader: + assert images.size(0) == 1, "image batch size of one for required" + + n, c, h, w = images.size(0), 2048, images.size(2), images.size(3) + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # MAC feature descriptor + features = extract(images) + features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=c) + features = features.data.cpu().numpy() + + ArrayIO.save(args.feature, features[0]) diff --git a/sfi/tools/feature3d.py b/sfi/tools/feature3d.py new file mode 100644 index 0000000..99f6e3d --- /dev/null +++ b/sfi/tools/feature3d.py @@ -0,0 +1,66 @@ +import sys + +import torch +import torch.nn as nn + +from torchvision.models.video import r2plus1d_18 + +from einops import rearrange + +from skvideo.io import vread + +from sfi.utils import batched + + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # r2d2 says "beep beep" + resnet = r2plus1d_18(pretrained=True, progress=False) + + resnet.fc = nn.Identity() + # resnet.avgpool = nn.Identity() + + for params in resnet.parameters(): + params.requires_grad = False + + resnet = resnet.to(device) + resnet = nn.DataParallel(resnet) + + resnet.eval() + + # Pre-trained Kinetics-400 statistics for normalization + mean, std = [0.43216, 0.394666, 0.37645], [0.22803, 0.22145, 0.216989] + + mean = rearrange(torch.as_tensor(mean), "n -> () n () ()") + std = rearrange(torch.as_tensor(std), "n -> () n () ()") + + video = vread(str(args.video)) + + with torch.no_grad(): + for i, batch in enumerate(batched(video, args.timesteps)): + # TODO: + # - encapsulate video dataset + # - abstract away transforms + # - fix timesteps vs batching + + batch = rearrange(batch, "t h w c -> t c h w") + batch = torch.tensor(batch) + batch = 
batch.to(torch.float32) / 255 + + batch = (batch - mean) / std + + # model expects NxCxTxHxW + inputs = rearrange(batch, "t c h w -> () c t h w") + inputs = inputs.to(device) + + outputs = resnet(inputs) + outputs = rearrange(outputs, "() n -> n") + outputs = outputs.data.cpu().numpy() + + print("seq={}, frames=range({}, {}), prediction={}" + .format(i, i * args.timesteps, (i + 1) * args.timesteps, outputs.shape)) diff --git a/sfi/tools/frames.py b/sfi/tools/frames.py new file mode 100644 index 0000000..814bca5 --- /dev/null +++ b/sfi/tools/frames.py @@ -0,0 +1,67 @@ +import sys + +from torch.utils.data import DataLoader + +from PIL import Image + +import numpy as np +from einops import reduce + +from skvideo.io import vread + +from sfi.features import FeatureExtractor +from sfi.utils import batched + + +def main(args): + args.frames.mkdir(exist_ok=True) + + key = None + video = vread(str(args.video)) + extract = FeatureExtractor(image_size=args.image_size) + + nframes, nkeys = 0, 0 + + for i, batch in enumerate(batched(video, args.batch_size)): + # We should use the IterableDataset from upcoming PyTorch version for FramesDataset + + frames = [Image.fromarray(each) for each in batch] + + dataset = [extract.transform(frame) for frame in frames] + dataloader = DataLoader(dataset, batch_size=args.batch_size) + + assert len(dataloader) == 1 + images = next(iter(dataloader)) + + n, c, h, w = images.size(0), 2048, images.size(2), images.size(3) + + features = extract(images) + features = features.data.cpu().numpy() + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + features = reduce(features, "n (h w) c -> n c", reduction=args.pool, n=n, h=h, w=w, c=c) + + for j, (frame, feature) in enumerate(zip(frames, features)): + nframes += 1 + + fid = i * args.batch_size + j + + if key: + prev_frame, prev_feature = key + + if similarity(prev_feature, feature) > args.similarity: + continue + + nkeys += 1 + key = frame, feature + frame.save(args.frames / "{:010d}.jpg".format(fid)) + + if nframes != 0: + print("Processed total={} keep={} drop={} ratio={}" + .format(nframes, nkeys, nframes - nkeys, round(nkeys / nframes, 2)), file=sys.stderr) + + +def similarity(x, y): + return (x @ y) / (np.linalg.norm(x) * np.linalg.norm(y)) diff --git a/sfi/tools/infer.py b/sfi/tools/infer.py new file mode 100644 index 0000000..d8dd4ae --- /dev/null +++ b/sfi/tools/infer.py @@ -0,0 +1,70 @@ +import sys +from pathlib import Path + +import torch +import torch.nn as nn +import torch.backends.cudnn +from torch.utils.data import DataLoader + +from torchvision.models import resnet50 +from torchvision.transforms import Compose, Normalize, ToTensor, Resize, CenterCrop + +from tqdm import tqdm + +from sfi.io import JsonIO +from sfi.datasets import ImageDirectory +from sfi.transforms import ToImageMode + + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # ImageNet statistics (because we use pre-trained model) + mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] + + transform = Compose([ + ToImageMode("RGB"), + Resize(256), + CenterCrop(224), + ToTensor(), + Normalize(mean=mean, std=std)]) + + dataset = ImageDirectory(root=args.dataset, transform=transform) + dataloader = DataLoader(dataset, batch_size=args.batch_size, num_workers=args.num_workers) + + # Binary classifier on top of resnet50 + model = resnet50() + model.fc = 
nn.Linear(model.fc.in_features, 2) + + model = model.to(device) + model = nn.DataParallel(model) + + # Restore trained weights + weights = torch.load(str(args.model), map_location=device) + model.load_state_dict(weights) + + model.eval() + + results = [] + + with torch.no_grad(): + for inputs, paths in tqdm(dataloader, desc="infer", unit="batch", ascii=True): + inputs = inputs.to(device) + + outputs = model(inputs) + + _, preds = torch.max(outputs, dim=1) + preds = preds.data.cpu().numpy() + + probs = nn.functional.softmax(outputs, dim=1) + probs = probs.data.cpu().numpy() + + for path, pred, prob in zip(paths, preds, probs): + result = {"class": pred.item(), "probability": round(prob.max().item(), 3), "path": Path(path).name} + results.append(result) + + JsonIO.save(args.results, results) diff --git a/sfi/tools/server.py b/sfi/tools/server.py new file mode 100644 index 0000000..6b263be --- /dev/null +++ b/sfi/tools/server.py @@ -0,0 +1,58 @@ +import sys +import base64 +import binascii + +import numpy as np +from einops import rearrange + +from flask import Flask, request, jsonify, abort + +from sfi.index import Index, IndexQueryError + + +app = Flask(__name__) +index = None + + +@app.route("/", methods=["POST"]) +def query(): + if not index: + return abort(503) + + req = request.get_json(force=True, silent=False, cache=False) + + if not all(v in req for v in ["feature", "shape", "dtype"]): + return abort(400) + + try: + feature = base64.b64decode(req["feature"]) + except binascii.Error: + return abort(400) + + N, C = req["shape"] + dtype = req["dtype"] + + try: + vs = np.frombuffer(feature, dtype=dtype) + vs = rearrange(vs, "(n c) -> n c", n=N, c=C) + except ValueError: + return abort(400) + + num_results = req.get("num_results", 1) + + try: + results = index.query(vs, num_results=num_results) + except IndexQueryError: + return abort(400) + + return jsonify([{"distance": d, "path": p} for d, p in results]) + + +def main(args): + print("Loading index from disk", file=sys.stderr) + + global index + index = Index(path=args.index, metadata=args.index.with_suffix(".json"), + features_size=args.features_size, num_probes=args.num_probes) + + app.run(host=args.host, port=args.port, debug=False) diff --git a/sfi/tools/stream.py b/sfi/tools/stream.py new file mode 100644 index 0000000..102421d --- /dev/null +++ b/sfi/tools/stream.py @@ -0,0 +1,92 @@ +import sys +from pathlib import Path + +from torch.utils.data import DataLoader, random_split + +import numpy as np +from einops import reduce + +from faiss import IndexFlatL2, IndexIVFPQ + +from tqdm import tqdm + +from sfi.datasets import ImageDirectory +from sfi.features import FeatureExtractor +from sfi.io import IndexIO, JsonIO + +kNumCells = 100 +kNumCentroids = 256 # Note: on gpu this will not work; see links below +kNumBitsPerIdx = 8 + +# Gpu centroid limitations +# - https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/gpu/impl/IVFPQ.cu#L69-L92 +# - https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/ProductQuantizer.cpp#L189 + + +def main(args): + # https://github.com/facebookresearch/faiss/blob/a8118acbc516b0263dde610862c806400cc48bf5/Clustering.cpp#L78-L80 + if args.num_train < max(kNumCells, kNumCentroids): + sys.exit("Error: require at least {} training samples".format(max(kNumCells, kNumCentroids))) + + extract = FeatureExtractor(image_size=args.image_size) + + dataset = ImageDirectory(root=args.frames, transform=extract.transform) + train_dataset, index_dataset = 
random_split(dataset, [args.num_train, len(dataset) - args.num_train]) + + if len(train_dataset) > len(index_dataset) or len(train_dataset) > 0.25 * len(index_dataset): + sys.exit("Error: training dataset too big: train={}, index={}".format(len(train_dataset), len(index_dataset))) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers) + index_loader = DataLoader(index_dataset, batch_size=args.batch_size, num_workers=args.num_workers) + + N, C = len(train_dataset), 2048 + + train_features = np.empty(shape=(N, C), dtype=np.float32) + + for i, (images, paths) in enumerate(tqdm(train_loader, desc="Train", unit="batch", ascii=True)): + n, h, w = images.size(0), images.size(2), images.size(3) + + features = extract(images) + features = features.data.cpu().numpy() + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # MAC feature + features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=C) + + train_features[i * args.batch_size: i * args.batch_size + n] = features + + quantizer = IndexFlatL2(C) + + index = IndexIVFPQ(quantizer, C, kNumCells, kNumCentroids, kNumBitsPerIdx) + index.do_polysemous_training = True + + print("Training index on training features", file=sys.stderr) + index.train(train_features) + + metadata = [] + + for images, paths in tqdm(index_loader, desc="Index", unit="batch", ascii=True): + n, h, w = images.size(0), images.size(2), images.size(3) + + # resnet5 downsamples x2 five times + h, w = h // 32, w // 32 + + # MAC feature descriptor + features = extract(images) + features = reduce(features, "n (h w) c -> n c", "max", n=n, h=h, w=w, c=C) + features = features.data.cpu().numpy() + + # C-array required for faiss FFI: tensors might not be contiguous + features = np.ascontiguousarray(features) + + # Add a batch of (batch*49, 2048) unpooled features to the index at once + index.add(features) + + for path in paths: + fname = Path(path).name + metadata.append(fname) + + IndexIO.save(args.index.with_suffix(".idx"), index) + JsonIO.save(args.index.with_suffix(".json"), metadata) diff --git a/sfi/tools/train.py b/sfi/tools/train.py new file mode 100644 index 0000000..fe0ca8b --- /dev/null +++ b/sfi/tools/train.py @@ -0,0 +1,153 @@ +import sys +import copy +import collections + +import torch +import torch.nn as nn +import torch.optim +import torch.backends.cudnn +from torch.utils.data import DataLoader + +from torchvision.models import resnet50 +from torchvision.datasets import ImageFolder +from torchvision.transforms import Compose, Normalize, ToTensor, Resize, RandomHorizontalFlip + +from tqdm import tqdm + +from sfi.transforms import ToImageMode +from sfi.mixup import MixupDataLoaderAdaptor, MixupCrossEntropyLossAdaptor +from sfi.utils import decay_weights + + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + if torch.cuda.is_available(): + print("Using CUDA, benchmarking implementations", file=sys.stderr) + torch.backends.cudnn.benchmark = True + + # ImageNet statistics (because we use pre-trained model) + mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225] + + transform = Compose([ + ToImageMode("RGB"), + Resize(256), + RandomHorizontalFlip(), + ToTensor(), + Normalize(mean=mean, std=std)]) + + train_dataset = ImageFolder(root=args.dataset / "train", transform=transform) + val_dataset = ImageFolder(root=args.dataset / "val", transform=transform) + + train_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers, 
shuffle=True) + train_loader = MixupDataLoaderAdaptor(train_loader) + + val_loader = DataLoader(val_dataset, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False) + + model = resnet50(pretrained=True, progress=False) + + # Add binary classification head + model.fc = nn.Linear(model.fc.in_features, 2) + + model = model.to(device) + model = nn.DataParallel(model) + + if args.resume_from: + weights = torch.load(str(args.resume_from), map_location=device) + model.load_state_dict(weights) + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-5) + + counts = collections.Counter(train_dataset.targets).values() + weight = torch.tensor([min(counts) / v for v in counts]).to(device) + + train_criterion = MixupCrossEntropyLossAdaptor(weight=weight) + val_criterion = nn.CrossEntropyLoss(weight=weight) + + best_wts = copy.deepcopy(model.state_dict()) + best_acc = 0.0 + + for epoch in range(args.num_epochs): + print("Epoch {}/{}".format(epoch, args.num_epochs - 1)) + print("-" * 10) + + loss, _, _, _ = train(model, train_criterion, optimizer, device, + dataset=train_dataset, dataloader=train_loader) + + print("train loss: {:.4f}".format(loss)) + + loss, acc, precision, recall = validate(model, val_criterion, device, + dataset=val_dataset, dataloader=val_loader) + + print("val loss: {:.4f} acc: {:.4f} precision: {:.4f} recall: {:.4f}".format(loss, acc, precision, recall)) + + if acc > best_acc: + best_acc = acc + best_wts = copy.deepcopy(model.state_dict()) + + print() + + print("Best acc: {:4f}".format(best_acc)) + + torch.save(best_wts, str(args.model)) + + +def train(model, criterion, optimizer, device, dataset, dataloader): + model.train() + + running_loss = 0.0 + + for inputs, t, labels1, labels2 in tqdm(dataloader, desc="train", unit="batch", ascii=True): + inputs = inputs.to(device) + t = t.to(device) + labels1 = labels1.to(device) + labels2 = labels2.to(device) + + optimizer.zero_grad() + + outputs = model(inputs) + + loss = criterion(outputs, t, labels1, labels2) + + loss.backward() + decay_weights(optimizer, 1e-4) + optimizer.step() + + running_loss += loss.item() * inputs.size(0) + + epoch_loss = running_loss / len(dataset) + + return epoch_loss, float("NaN"), float("NaN"), float("NaN") + + +def validate(model, criterion, device, dataset, dataloader): + model.eval() + + running_loss = 0.0 + tn, fn, tp, fp = 0, 0, 0, 0 + + with torch.no_grad(): + for inputs, labels in tqdm(dataloader, desc="val", unit="batch", ascii=True): + inputs = inputs.to(device) + labels = labels.to(device) + + outputs = model(inputs) + _, preds = torch.max(outputs, dim=1) + + loss = criterion(outputs, labels) + + running_loss += loss.item() * inputs.size(0) + + confusion = preds.float() / labels.float() + tn += torch.sum(torch.isnan(confusion)).item() + fn += torch.sum(confusion == float("inf")).item() + tp += torch.sum(confusion == 1).item() + fp += torch.sum(confusion == 0).item() + + epoch_loss = running_loss / len(dataset) + + accuracy = (tp + tn) / (tp + tn + fp + fn) + precision = tp / (tp + fp) + recall = tp / (tp + fn) + + return epoch_loss, accuracy, precision, recall diff --git a/sfi/transforms.py b/sfi/transforms.py new file mode 100644 index 0000000..4320a9b --- /dev/null +++ b/sfi/transforms.py @@ -0,0 +1,44 @@ +import torchvision.transforms.functional as F + + +def to_image_mode(image, mode): + return image.convert(mode) + + +class ToImageMode: + def __init__(self, mode): + self.mode = mode + + def __call__(self, image): + return to_image_mode(image, self.mode) + + +def 
pad_to_multiple(image, multiple, fill=0, padding_mode="constant"): + w, h = image.size + + def next_multiple_of(n, multiple): + return ((n // multiple) + int(bool(n % multiple))) * multiple + + padded_w = next_multiple_of(w, multiple) + padded_h = next_multiple_of(h, multiple) + + pad_left = (padded_w - w) // 2 + pad_right = pad_left + (padded_w - w) % 2 + + pad_top = (padded_h - h) // 2 + pad_bottom = pad_top + (padded_h - h) % 2 + + padding = (pad_left, pad_top, pad_right, pad_bottom) + + return F.pad(image, padding, fill=fill, padding_mode=padding_mode) + + +class PadToMultiple: + def __init__(self, multiple, fill=0, padding_mode="constant"): + self.multiple = multiple + self.fill = fill + self.padding_mode = padding_mode + + def __call__(self, image): + return pad_to_multiple(image, multiple=self.multiple, fill=self.fill, + padding_mode=self.padding_mode) diff --git a/sfi/utils.py b/sfi/utils.py new file mode 100644 index 0000000..ad655be --- /dev/null +++ b/sfi/utils.py @@ -0,0 +1,20 @@ +import itertools + + +def batched(iterable, n): + counter = itertools.count() + + for _, group in itertools.groupby(iterable, lambda _: next(counter) // n): + yield list(group) + + +def files(path): + return sorted([p for p in path.iterdir() if p.is_file()]) + + +# Proper weight decay for Adam, not L2 penalty +# https://github.com/pytorch/pytorch/pull/4429 +def decay_weights(optimizer, v): + for group in optimizer.param_groups: + for param in group["params"]: + param.data.add_(-v * group["lr"])
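A closing note on the MAC descriptor used by save-feature and stream-index: the resnet50 feature map is max-pooled per channel over all spatial positions, so every frame ends up as a single 2048-d vector. A minimal sketch of that pooling step, using a random tensor as a stand-in for real resnet50 activations:

    import torch
    from einops import rearrange, reduce

    # Stand-in for a resnet50 feature map of one padded 448x448 frame:
    # 2048 channels over a 14x14 spatial grid.
    n, c, h, w = 1, 2048, 14, 14
    feature_map = torch.rand(n, c, h, w)

    # Arrange into the "n (h w) c" layout the FeatureExtractor produces.
    features = rearrange(feature_map, "n c h w -> n (h w) c")

    # MAC: per channel, keep the maximum activation over all spatial positions,
    # yielding one 2048-d descriptor per frame (the vector that gets indexed).
    mac = reduce(features, "n (h w) c -> n c", "max", h=h, w=w)
    print(mac.shape)  # torch.Size([1, 2048])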