Updates for PyTorch 2.0 (#94)

* Remove conda copying from the deployment stage. * Add extra utilities and help on sorting yaml requirements files. * Remove deprecated Caffe2 build flags. * Fix comments concerning the build process. * Update PyTorch default versions to v2.0.0 and TorchVision to 0.15.1. * Update ruff version. * Update build requirements for PyTorch 2.x even though this is a breaking change. The README was updated to mention this. * Remove FFMPEG flags from the TorchVision build process. * Add SymPy as a PyTorch runtime dependency. * Make deployment stage build target configurable. * Add documentation on how to specify build target stages and how to get the wheel files. * Reformat code. * Updated deployment MKL version to 2023. Installing from pip still does not work. * Change train build settings to compile build tests to match the train and deployment build configurations by default. * Fix formatting.
cresset-template · Mar 20, 2023 · 34d5f8b · 34d5f8b
1 parent 7568722
commit 34d5f8b
Show file tree

Hide file tree

Showing 9 changed files with 67 additions and 57 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ repos:
 
   # Ruff should be executed before other formatters.
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.0.254"
+    rev: "v0.0.256"
     hooks:
       - id: ruff
         args: [--exit-non-zero-on-fix]

diff --git a/Dockerfile b/Dockerfile
@@ -207,14 +207,12 @@ COPY --link --from=clone-torch /opt/pytorch /opt/pytorch
 # Read `setup.py` and `CMakeLists.txt` to find build flags.
 # Different flags are available for different versions of PyTorch.
 # Variables without default values here receive defaults from the top of the Dockerfile.
-# Disabling Caffe2, NNPack, and QNNPack as they are legacy and most users do not need them.
+# Disabling NNPack and QNNPack by default as they are legacy and most users do not need them.
 ARG USE_CUDA
 ARG USE_CUDNN=${USE_CUDA}
 ARG USE_NNPACK=0
 ARG USE_QNNPACK=0
 ARG BUILD_TEST=0
-ARG BUILD_CAFFE2=0
-ARG BUILD_CAFFE2_OPS=0
 ARG USE_PRECOMPILED_HEADERS
 ARG TORCH_CUDA_ARCH_LIST
 ARG CMAKE_PREFIX_PATH=/opt/conda
@@ -291,9 +289,6 @@ RUN --mount=type=bind,from=build-pillow,source=/tmp/dist,target=/tmp/dist \
     python -m pip install --force-reinstall --no-deps /tmp/dist/*
 
 ARG USE_CUDA
-# Disable FFMPEG and remove it as a build dependency if TorchVision
-# fails to compile with unhelpful error messages.
-ARG USE_FFMPEG=1
 ARG USE_PRECOMPILED_HEADERS
 ARG FORCE_CUDA=${USE_CUDA}
 ARG TORCH_CUDA_ARCH_LIST
@@ -532,7 +527,6 @@ COPY --link --from=fetch-vision  /tmp/dist  /tmp/dist
 ########################################################################
 FROM ${BUILD_IMAGE} AS deploy-builds-include
 
-COPY --link --from=install-conda /opt/conda /opt/conda
 COPY --link --from=build-pillow  /tmp/dist  /tmp/dist
 COPY --link --from=build-vision  /tmp/dist  /tmp/dist
 
@@ -548,7 +542,7 @@ FROM deploy-builds-${BUILD_MODE} AS deploy-builds
 
 # The Anaconda defaults channel and Intel MKL are not fully open-source.
 # Enterprise users may therefore wish to remove them from their final product.
-# The deployment therefore uses system Python. Conda is copied here just in case.
+# The deployment therefore uses system Python.
 # Intel packages such as MKL can be removed by using MKL_MODE=exclude during the build.
 # This may also be useful for non-Intel CPUs.
 

diff --git a/README.md b/README.md
@@ -119,24 +119,27 @@ IMAGE_NAME=cresset:train-USERNAME
 # [[Optional]]: Fill in these configurations manually if the defaults do not suffice.
 
 # NVIDIA GPU Compute Capability (CCA) values may be found at https://developer.nvidia.com/cuda-gpus
-CCA=8.6                            # Compute capability. CCA=8.6 for RTX3090 and A100.
-# CCA='8.6+PTX'                    # The '+PTX' enables forward compatibility. Multi-architecture builds can also be specified.
-# CCA='7.5 8.6+PTX'                # Visit the documentation for details. https://pytorch.org/docs/stable/cpp_extension.html
+CCA=8.6                          # Compute capability. CCA=8.6 for RTX3090; A100 is CCA=8.0.
+# CCA='8.6+PTX'                  # The '+PTX' enables forward compatibility. Multi-architecture builds can also be specified.
+# CCA='7.5 8.6+PTX'              # Visit the documentation for details. https://pytorch.org/docs/stable/cpp_extension.html
 
 # Used only if building PyTorch from source (`BUILD_MODE=include`).
 # The `*_TAG` variables are used only if `BUILD_MODE=include`. No effect otherwise.
-BUILD_MODE=exclude                 # Whether to build PyTorch from source.
-PYTORCH_VERSION_TAG=v1.13.1        # Any `git` branch or tag name can be used.
+BUILD_MODE=exclude               # Whether to build PyTorch from source.
+PYTORCH_VERSION_TAG=v1.13.1      # Any `git` branch or tag name can be used.
 TORCHVISION_VERSION_TAG=v0.14.1
 
 # General environment configurations.
-LINUX_DISTRO=ubuntu                # Visit the NVIDIA Docker Hub repo for available base images.
-DISTRO_VERSION=22.04               # https://hub.docker.com/r/nvidia/cuda/tags
-CUDA_VERSION=11.7.1                # Must be compatible with hardware and CUDA driver.
-CUDNN_VERSION=8                    # Only major version specifications are available.
-PYTHON_VERSION=3.10                # Specify the Python version.
-MKL_MODE=include                   # Enable MKL for Intel CPUs.
-TZ=Asia/Seoul                      # Set the container timezone.
+LINUX_DISTRO=ubuntu              # Visit the NVIDIA Docker Hub repo for available base images.
+DISTRO_VERSION=22.04             # https://hub.docker.com/r/nvidia/cuda/tags
+CUDA_VERSION=11.7.1              # Must be compatible with hardware and CUDA driver.
+CUDNN_VERSION=8                  # Only major version specifications are available.
+PYTHON_VERSION=3.10              # Specify the Python version.
+MKL_MODE=include                 # Enable MKL for Intel CPUs.
+TZ=Asia/Seoul                    # Set the container timezone.
+
+# Advanced Usage.
+TARGET_STAGE=train               # Target Dockerfile stage. The `*.whl` files are available in `train-builds`.
 ```
 
 ## General Usage After Initial Installation and Configuration

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -60,14 +60,12 @@ services:
       dockerfile: Dockerfile
       args: # Equivalent to `--build-arg`.
         BUILD_MODE: ${BUILD_MODE:-exclude}
-        BUILD_CAFFE2: 0 # Caffe2 disabled for faster build.
-        BUILD_CAFFE2_OPS: 0
-        BUILD_TEST: 0
+        BUILD_TEST: 1 # Enable tests to have identical configurations with deployment.
         USE_NNPACK: 0
         USE_QNNPACK: 0
         LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu}
         DISTRO_VERSION: ${DISTRO_VERSION:-22.04}
-        CUDA_VERSION: ${CUDA_VERSION:-11.7.1}
+        CUDA_VERSION: ${CUDA_VERSION:-11.8.0}
         CUDNN_VERSION: ${CUDNN_VERSION:-8}
         PYTHON_VERSION: ${PYTHON_VERSION:-3.10}
         MKL_MODE: ${MKL_MODE:-include} # MKL_MODE can be `include` or `exclude`.
@@ -85,12 +83,12 @@ services:
         # Fails if `BUILD_MODE=include` but `CCA` is not set explicitly.
         TORCH_CUDA_ARCH_LIST: ${CCA}
         # Variables for building PyTorch. Must be valid git tags.
-        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v1.13.1}
-        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.14.1}
+        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.0.0}
+        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.15.1}
         # Variables for downloading PyTorch instead of building.
-        PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu117}
-        PYTORCH_VERSION: ${PYTORCH_VERSION:-1.13.1}
-        TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.14.1}
+        PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu118}
+        PYTORCH_VERSION: ${PYTORCH_VERSION:-2.0.0}
+        TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.15.1}
         PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
         GID: ${GID:-1000}
         UID: ${UID:-1000}
@@ -132,20 +130,18 @@ services:
     volumes: # Place user-specific directories in `docker-compose.override.yaml`.
       - .:${PROJECT_ROOT:-/opt/project}
     build:
-      target: deploy
+      target: ${TARGET_STAGE:-deploy}
       context: .
       dockerfile: Dockerfile
       args:
         BUILD_MODE: ${BUILD_MODE:-exclude}
         # The Anaconda `defaults` channel is not free for commercial use.
         BUILD_TEST: 1 # Enable build tests for deployment.
-        BUILD_CAFFE2: 1 # Caffe2 should be enabled in production settings.
-        BUILD_CAFFE2_OPS: 1
-        USE_NNPACK: 1 # Enable NNPack for deployment.
-        USE_QNNPACK: 1 # Enable QNNPack for deployment.
+        USE_NNPACK: 0 # Enable NNPack for deployment if required.
+        USE_QNNPACK: 0 # Enable QNNPack for deployment if required.
         LINUX_DISTRO: ${LINUX_DISTRO:-ubuntu}
         DISTRO_VERSION: ${DISTRO_VERSION:-22.04}
-        CUDA_VERSION: ${CUDA_VERSION:-11.7.1}
+        CUDA_VERSION: ${CUDA_VERSION:-11.8.0}
         CUDNN_VERSION: ${CUDNN_VERSION:-8}
         PYTHON_VERSION: ${PYTHON_VERSION:-3.10}
         # Requirements must include `mkl` if `MKL_MODE` is set to `include` for deployment.
@@ -155,12 +151,12 @@ services:
         CONDA_MANAGER: ${CONDA_MANAGER:-mamba}
         TORCH_CUDA_ARCH_LIST: ${CCA} # This will fail if BUILD_MODE=include but CCA is not set explicitly.
         # Variables for building PyTorch. Must be valid git tags.
-        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v1.13.1}
-        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.14.1}
+        PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.0.0}
+        TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.15.1}
         # Variables for downloading PyTorch instead of building.
-        PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu117}
-        PYTORCH_VERSION: ${PYTORCH_VERSION:-1.13.1}
-        TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.14.1}
+        PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu118}
+        PYTORCH_VERSION: ${PYTORCH_VERSION:-2.0.0}
+        TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.15.1}
         PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project}
     #        DEB_OLD: ${DEB_OLD:-http://archive.ubuntu.com}
     #        DEB_NEW: ${DEB_NEW:-http://mirror.kakao.com}

diff --git a/pyproject.toml b/pyproject.toml
@@ -66,7 +66,7 @@ ignore-init-module-imports = true
 lines-after-imports = 2
 
 [tool.ruff.pycodestyle]
-# PEP8 states sets maximum documentation length to 72 but this is 
+# PEP8 sets the maximum documentation length to 72 but this is
 # too short for many people. Using 80 as in the Google Style Guide.
 max-doc-length = 80
 

diff --git a/reqs/README.md b/reqs/README.md
@@ -13,6 +13,15 @@ project root directory because of the `.dockerignore` file.
 To use files in other directories,
 please modify the `.dockerignore` file.
 
+# Notes on Building PyTorch 1.x
+
+PyTorch v2.x has very different build dependencies from PyTorch v1.x.
+While it may have been best to keep all dependencies, the build dependencies
+have been cleaned up for the PyTorch v2.x builds to save time and space.
+
+To build legacy PyTorch 1.x versions, copy the requirements from the following
+[link](https://github.com/cresset-template/cresset/blob/7568722631a458980b6586ab0799a2e0d6f0a3da/reqs/conda-build.requirements.txt).
+
 ## Build Dependency Versions
 
 Edit the package versions in `*-build.requirements.txt` if the latest versions

diff --git a/reqs/conda-build.requirements.txt b/reqs/conda-build.requirements.txt
@@ -1,26 +1,29 @@
+# PyTorch 2.x build-time dependencies. Do not use for PyTorch 1.x compilation.
+
 # Do not edit this file (including comments) unless absolutely necessary.
 # Editing this file will invalidate the Docker build cache for all build layers.
 # Specify package versions if necessary for the build.
 # Also, do not add MKL or related packages in this file.
 astunparse
-autoconf
 ccache
-cffi
 cmake
-ffmpeg  # Remove this if TorchVision fails to compile.
-future
+expecttest
+filelock
+fsspec
 git  # Needed to get the `git` commit hash, etc.
+hypothesis
 jemalloc
+jinja2
 libjpeg-turbo
 libpng
 lld
+networkx
 ninja
 numpy
-# pillow  # Not necessary as Pillow-SIMD is used.
-pkgconfig
+psutil
 pyyaml
 requests
-rsync
-setuptools  # ==59.5.0 # For older PyTorch versions that use `distutils.version`.
-six
-typing_extensions
+setuptools
+sympy
+types-dataclasses
+typing-extensions
diff --git a/reqs/pip-deploy.requirements.txt b/reqs/pip-deploy.requirements.txt
@@ -1,6 +1,6 @@
-# Lower the MKL version to mkl==2021.4.0 if PyTorch cannot find `libmkl_intel_lp64.so.1`.
-# Raise the MKL version to mkl==2022.x.x if PyTorch cannot find `libmkl_intel_lp64.so.2`.
 # The MKL major version (year) used to build PyTorch must match the version to run it.
 # Include the appropriate version of the `mkl` package manually if `MKL_MODE=include`.
-mkl==2022.1.0
-tqdm==4.64.0
+# Lower the MKL version to mkl==2021.4.0 if PyTorch cannot find `libmkl_intel_lp64.so.1`.
+# Raise the MKL version to mkl==2022.x.x if PyTorch cannot find `libmkl_intel_lp64.so.2`.
+mkl==2023.0.0
+tqdm==4.65.0
diff --git a/reqs/train-environment.yaml b/reqs/train-environment.yaml
@@ -4,6 +4,8 @@
 # to reduce dependency issues with conda and for greater flexibility.
 # Manually add dependencies of compiled libraries for reduced
 # installation with pip.
+# Tip: Use `awk 'START_LINE<=NR && NR<=FINISH_LINE' reqs/train-environment.yaml | sort`
+# to sort dependencies in the command line while preserving comments, etc.
 name: base # Always use the `base` environment.
 channels:
   - nodefaults # Do not use the default environment.
@@ -16,6 +18,7 @@ dependencies: # Use conda packages if possible.
   - libpng # TorchVision dependency.
   - numpy # Intel optimized NumPy is not available on PyPI.
   - mkl # Essential if BUILD_MODE=include and MKL_MODE=include.
+  - sympy # A PyTorch dependency.
   - tqdm
   - typing_extensions # A PyTorch dependency.
 
@@ -25,11 +28,13 @@ dependencies: # Use conda packages if possible.
   - tzdata
 
   # Utility packages.
+  - attrs
   - conda-lock
   - git
   - htop
+  - invoke
   - lazygit
-  - monkeytype
+  - loguru
   - nano
   - pandera
   - parallel