From febf0e92d953ed2b22adcd83e876316ac1788a1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joonhyung=20Lee/=EC=9D=B4=EC=A4=80=ED=98=95?= <33523965+veritas9872@users.noreply.github.com> Date: Thu, 20 Apr 2023 10:25:26 +0900 Subject: [PATCH] Fixes many miscellaneous bugs. (#122) * Remove the Intel channel from Python, Numpy etc. because issues only arise with `pip` installs, not `conda` installs. The Intel channel was causing unnecessary overhead and issues. * Make the rules for lockfiles more flexible. * Fix accidental CUDA version change to 11.7 back to 11.8. * Remove unnecessary `apt` requirements. * Remove HOST_NAME variable. * Cleanup of the docker-compose.yaml file to make services easier to see. Most settings have been moved to the `base` service. Also got rid of the $HOST_NAME variable. * Set the `SHELL` environment variable to an empty string as it was previously fixed to `/bin/bash`. This fixes the color problem in new `tmux` shells and prevents possible incompatibilities in Docker `RUN` instructions. --- .dockerignore | 4 +- Makefile | 12 +-- README.md | 1 - docker-compose.yaml | 158 ++++++++++++-------------------- dockerfiles/ngc.Dockerfile | 4 + reqs/hub-apt.requirements.txt | 1 - reqs/simple-environment.yaml | 6 +- reqs/train-apt.requirements.txt | 4 +- 8 files changed, 71 insertions(+), 119 deletions(-) diff --git a/.dockerignore b/.dockerignore index 6eb7c6b..89206dc 100644 --- a/.dockerignore +++ b/.dockerignore @@ -7,5 +7,5 @@ !**/*requirements*.txt !*environment*.yaml !**/*environment*.yaml -!*conda-lock.yaml -!**/*conda-lock.yaml +!*conda-lock*.yaml +!**/*conda-lock*.yaml diff --git a/Makefile b/Makefile index 397a641..ab4c4d8 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ _PROJECT = "${SERVICE}-${USR}" PROJECT = $(shell echo ${_PROJECT} | tr "[:upper:]" "[:lower:]") PROJECT_ROOT = /opt/project -# Creates a `.env` file in PWD if it does not exist. +# Creates a `.env` file in ${PWD} if it does not exist. 
# This will help prevent UID/GID bugs in `docker-compose.yaml`, # which unfortunately cannot use shell outputs in the file. # Image names have the usernames appended to them to prevent @@ -39,7 +39,6 @@ IMAGE_NAME = $(shell echo ${_IMAGE_NAME} | tr "[:upper:]" "[:lower:]") # Makefiles require `$\` at the end of a line for multi-line string values. # https://www.gnu.org/software/make/manual/html_node/Splitting-Lines.html -# `HOST_NAME` avoids conflict with the `HOSTNAME` shell builtin variable. ENV_TEXT = "$\ GID=${GID}\n$\ UID=${UID}\n$\ @@ -48,16 +47,13 @@ USR=${USR}\n$\ PROJECT=${PROJECT}\n$\ SERVICE=${SERVICE}\n$\ COMMAND=${COMMAND}\n$\ -HOST_NAME=${SERVICE}\n$\ IMAGE_NAME=${IMAGE_NAME}\n$\ PROJECT_ROOT=${PROJECT_ROOT}\n$\ " -# Creates the `.env` file if it does not exist. -# The `.env` file must be checked via the shell -# as is cannot be made into a Makefile target. -# This would make it impossible to reference it in the `include` command. -env: +# The `.env` file must be checked via shell as it cannot be a Makefile target. +# Doing so would make it impossible to reference `.env` in the `-include` command. +env: # Creates the `.env` file if it does not exist. @test -f ${ENV_FILE} || printf ${ENV_TEXT} >> ${ENV_FILE} check: # Checks if the `.env` file exists. diff --git a/README.md b/README.md index 88abb06..382427a 100644 --- a/README.md +++ b/README.md @@ -182,7 +182,6 @@ USR=USERNAME PROJECT=train-username # `PROJECT` must be in lowercase. SERVICE=train COMMAND=/bin/zsh # Command to execute on starting the container. -HOST_NAME=train IMAGE_NAME=cresset:train-USERNAME PROJECT_ROOT=/opt/project diff --git a/docker-compose.yaml b/docker-compose.yaml index 4082ded..7954ff8 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -1,28 +1,24 @@ -# Requires the Docker Compose V2. +# Requires Docker Compose V2. 
# See https://docs.docker.com/compose/compose-file/compose-file-v3 # and https://github.com/compose-spec/compose-spec/blob/master/spec.md # for details concerning the `docker-compose.yaml` file syntax. -# Variables are in ${VARIABLE:-DEFAULT_VALUE} format -# to ensure that default values are given to the Dockerfile. -# Using a `.env` file to set variables is strongly recommended. -# However, note that variables in the host shell have -# higher priority than the `.env` file for Docker Compose. - -# Run `make env` to create a basic `.env` file with the UID and GID variables. -# Compute Capability must be specified via the `CCA` variable. - -# Using a `docker-compose.yaml` file has many advantages -# over creating custom shell scripts for each project. +# Using `docker-compose.yaml` has many advantages over writing custom shell scripts. # The settings are much easier to see and maintain than scattered shell scripts. # Also, Compose is a native Docker component, simplifying project maintenance. -# Set the host environment variable `BUILDKIT_PROGRESS=plain` to see the full build log. -# https://github.com/docker/cli/blob/master/docs/reference/commandline/cli.md#environment-variables +# Run `make env` to create a basic `.env` file with the UID and GID variables. +# Using a `.env` file to set variables is strongly recommended. However, +# variables in the host shell have higher priority than `.env` for Docker Compose. +# Variables are in ${VARIABLE:-DEFAULT_VALUE} format to specify default values. # See https://pytorch.org/docs/stable/cpp_extension.html for an # explanation of how to specify the `TORCH_CUDA_ARCH_LIST` variable. # The variable `CCA` is used to specify `TORCH_CUDA_ARCH_LIST`. +# Compute Capability must be specified via the `CCA` variable. + +# Set the host environment variable `BUILDKIT_PROGRESS=plain` to see the full build log. 
+# https://github.com/docker/cli/blob/master/docs/reference/commandline/cli.md#environment-variables networks: # Use the host network instead of creating a separate network. default: # This reduces load and conflicts with the host network. @@ -35,22 +31,50 @@ services: init: true # Equivalent to `--init` flag in `docker run`. stdin_open: true # equivalent to `-i` flag in `docker run`. working_dir: ${PROJECT_ROOT:-/opt/project} + user: ${UID:-1000}:${GID:-1000} # Specify USR/GRP at runtime. # Use different image names for different users and projects. # Otherwise, images will be repeatedly removed and recreated. # The removed images will remain cached, however. image: ${IMAGE_NAME} network_mode: host # Use the same network as the host, may cause security issues. # `ipc: host` removes the shared memory cap but is a known security vulnerability. - # ipc: host # Equivalent to `--ipc=host` in `docker run`. Disable this on WSL. + # ipc: host # Equivalent to `--ipc=host` in `docker run`. **Disable this on WSL.** # shm_size: 1GB # Explicit shared memory limit. No security issues this way. - environment: # Common runtime environment variables. + hostname: ${SERVICE} # Makes `pure` terminals easier to tell apart. + extra_hosts: # Prevents "unknown host" issue when using `sudo`. + - "${SERVICE}:127.0.0.1" + + # Common environment variables for the container runtime. No effect on build. + environment: # Equivalent to `--env` CUDA_DEVICE_ORDER: PCI_BUS_ID + HISTSIZE: 50000 # Hard-coded large command history size. TZ: ${TZ:-Asia/Seoul} # Timezone settings used during runtime. + # tmpfs: # Create directory in RAM for fast data IO. + # - /opt/data + # Default volume pairings of ${HOST_PATH}:${CONTAINER_PATH}. + # Allows the container to access `HOST_PATH` as `CONTAINER_PATH`. + # See https://docs.docker.com/storage/volumes for details. + # Always use the ${HOME} variable to specify the host home directory. + # See https://github.com/docker/compose/issues/6506 for details. 
+ volumes: # Equivalent to `-v` flag in `docker run`. + # Current working directory `.` is connected to `PROJECT_ROOT`. + # Mount `.` if the docker-compose.yaml file is at the project root. + # Mount `..` if Cresset is a subdirectory in a different project, etc. + - .:${PROJECT_ROOT:-/opt/project} + # Preserve VSCode extensions between containers. + # Assumes default VSCode server directory. + # May cause VSCode issues if multiple Cresset-based projects are on the + # same machine writing to the `${HOME}/.vscode-server` directory. + # If so, specify a different host directory for each project. + - ${HOME}/.vscode-server:/home/${USR:-user}/.vscode-server + build: context: . # Nearly all files are ignored due to `.dockerignore` settings. + target: ${TARGET_STAGE:-train} # Specify Dockerfile target build stage. args: # Common build-time environment variables. # Even if these variables are unnecessary during the build, # they can be ignored simply by not defining them in that stage. + INTERACTIVE_MODE: ${INTERACTIVE_MODE:-include} PROJECT_ROOT: ${PROJECT_ROOT:-/opt/project} GID: ${GID:-1000} UID: ${UID:-1000} @@ -66,37 +90,13 @@ services: capabilities: [gpu] # device_ids: [ "0" ] # Use only GPU 0. - train: # Default service name. Change the name for each project. + train: extends: service: base - # Set to the service name. Makes terminals easier to tell apart. - # `HOST_NAME` avoids conflict with the `HOSTNAME` shell builtin variable. - hostname: ${HOST_NAME:-train} - extra_hosts: - - "${HOST_NAME:-train}:127.0.0.1" # Prevents "unknown host" issue when using `sudo`. - user: ${UID:-1000}:${GID:-1000} - environment: # Environment variables for the container, not the build. Equivalent to `--env` - HISTSIZE: 50000 # Hard-coded large command history size. - # Setting `HOST_PATH:CONTAINER_PATH` - # allows the container to access `HOST_PATH` as `CONTAINER_PATH`. - # See https://docs.docker.com/storage/volumes for details. 
- # Current working directory `.` is connected to `PROJECT_ROOT`. - # Always use the ${HOME} variable to specify the host home directory. - # See https://github.com/docker/compose/issues/6506 for details. - volumes: # Equivalent to `-v` flag in `docker run`. - # Use this if the docker-compose.yaml file is at the project root. - - .:${PROJECT_ROOT:-/opt/project} - # Preserve VSCode extensions between containers. - # Assumes default VSCode server directory. - - ${HOME}/.vscode-server:/home/${USR:-user}/.vscode-server - # tmpfs: # Create directory in RAM for fast data IO. - # - /opt/data build: # Options for building. Used when `--build` is called in `docker compose`. # Set `TARGET_STAGE` to `train-builds` to get just the wheels in `/tmp/dist`. - target: ${TARGET_STAGE:-train} # Specify build target. dockerfile: Dockerfile args: # Equivalent to `--build-arg`. - INTERACTIVE_MODE: ${INTERACTIVE_MODE:-include} BUILD_MODE: ${BUILD_MODE:-exclude} BUILD_TEST: 1 # Enable tests to have identical configurations with deployment. USE_NNPACK: 0 @@ -110,84 +110,52 @@ services: MKL_MODE: ${MKL_MODE:-include} # MKL_MODE can be `include` or `exclude`. # Change the `CONDA_URL` for different hardware architectures. # URLs from https://github.com/conda-forge/miniforge are recommended over - # Miniconda URLs from https://docs.conda.io/en/latest/miniconda.html - # `CONDA_MANAGER` may be either `mamba` (the default) or `conda`. - # Mamba is a faster reimplementation of conda in C++ - # However, there are occasions where mamba is unable to - # resolve conflicts that conda can resolve. + # Miniconda URLs from https://docs.conda.io/en/latest/miniconda.html. + # The `CONDA_MANAGER` may be either `mamba` (the default) or `conda`. + # However, `mamba` may be unable to resolve conflicts that `conda` can. # In such cases, set `CONDA_MANAGER=conda` for conda-based installation. - # Note that installing Mamba via Mambaforge is strongly recommended. 
+ # Installing `mamba` via Mambaforge is strongly recommended. CONDA_URL: ${CONDA_URL:-https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh} CONDA_MANAGER: ${CONDA_MANAGER:-mamba} # Fails if `BUILD_MODE=include` but `CCA` is not set explicitly. - TORCH_CUDA_ARCH_LIST: ${CCA} - # Variables for building PyTorch. Must be valid git tags. + TORCH_CUDA_ARCH_LIST: ${CCA} # Ignore the missing CCA warning otherwise. + # Variables for building PyTorch. Must be valid git tags or commits. PYTORCH_VERSION_TAG: ${PYTORCH_VERSION_TAG:-v2.0.0} TORCHVISION_VERSION_TAG: ${TORCHVISION_VERSION_TAG:-v0.15.1} # Variables for downloading PyTorch instead of building. PYTORCH_INDEX_URL: ${PYTORCH_INDEX_URL:-https://download.pytorch.org/whl/cu118} PYTORCH_VERSION: ${PYTORCH_VERSION:-2.0.0} TORCHVISION_VERSION: ${TORCHVISION_VERSION:-0.15.1} - # URL for faster `apt` and `pip` installs. Optimized for Korean users. - # Use URLs optimized for user location and security requirements. + # URLs for faster `apt` and `pip` installs. Comment out to use the defaults. + # Use URLs optimized for location and security requirements. # DEB_OLD: ${DEB_OLD:-http://archive.ubuntu.com} # DEB_NEW: ${DEB_NEW:-http://mirror.kakao.com} - # Comment out the PyPI mirrors to use the default PyPI repository. # INDEX_URL: ${INDEX_URL:-http://mirror.kakao.com/pypi/simple} # TRUSTED_HOST: ${TRUSTED_HOST:-mirror.kakao.com} - # This layer may be useful for PyTorch contributors. devel: # Skeleton service for development and debugging. - extends: + extends: # This service may be useful for PyTorch CUDA/C++ contributors. service: base - hostname: ${HOST_NAME:-devel} - extra_hosts: - - "${HOST_NAME:-devel}:127.0.0.1" - volumes: - - .:${PROJECT_ROOT:-/opt/project} build: target: ${TARGET_STAGE:-build-base} # All builds begin at `build-base`. dockerfile: Dockerfile - # Service based on images from the NGC PyTorch image catalog. 
Visit - # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html - # for an up-to-date list of all NVIDIA NGC PyTorch images. - # Note that the NGC images are very unstable, with many differences between versions. - # This service may break for different `NGC_YEAR` and `NGC_MONTH` configurations. - ngc: + ngc: # Service based on images from the NGC PyTorch image catalog. + # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/running.html + # NGC images are very unstable, with many differences between versions. + # This service may break for different `NGC_YEAR` and `NGC_MONTH` values. extends: service: base - user: ${UID:-1000}:${GID:-1000} - environment: - HISTSIZE: 50000 # Hard-coded large command history size. - volumes: - - .:${PROJECT_ROOT:-/opt/project} - hostname: ${HOST_NAME:-ngc} - extra_hosts: - - "${HOST_NAME:-ngc}:127.0.0.1" build: - target: ${TARGET_STAGE:-train} dockerfile: dockerfiles/ngc.Dockerfile args: NGC_YEAR: ${NGC_YEAR:-23} NGC_MONTH: ${NGC_MONTH:-03} - INTERACTIVE_MODE: ${INTERACTIVE_MODE:-include} - # Service based on the official PyTorch Docker images from Docker Hub. Visit - # https://hub.docker.com/r/pytorch/pytorch/tags to find available images. - hub: - extends: + hub: # Service based on the official PyTorch Docker images from Docker Hub. + extends: # Available images: https://hub.docker.com/r/pytorch/pytorch/tags service: base - user: ${UID:-1000}:${GID:-1000} - environment: - HISTSIZE: 50000 # Hard-coded large command history size. 
- volumes: - - .:${PROJECT_ROOT:-/opt/project} - hostname: ${HOST_NAME:-hub} - extra_hosts: - - "${HOST_NAME:-hub}:127.0.0.1" build: - target: ${TARGET_STAGE:-train} dockerfile: dockerfiles/hub.Dockerfile args: PYTORCH_VERSION: ${PYTORCH_VERSION:-2.0.0} @@ -195,26 +163,14 @@ services: CUDA_SHORT_VERSION: ${CUDA_SHORT_VERSION:-11.7} CUDNN_VERSION: ${CUDNN_VERSION:-8} IMAGE_FLAVOR: ${IMAGE_FLAVOR:-devel} - INTERACTIVE_MODE: ${INTERACTIVE_MODE:-include} - # Service installed purely from official/verified Docker images and `conda`. - simple: + simple: # Service installed purely from official/verified Docker images and `conda`. extends: service: base - user: ${UID:-1000}:${GID:-1000} - environment: - HISTSIZE: 50000 # Hard-coded large command history size. - volumes: - - .:${PROJECT_ROOT:-/opt/project} - hostname: ${HOST_NAME:-simple} - extra_hosts: - - "${HOST_NAME:-simple}:127.0.0.1" build: - target: ${TARGET_STAGE:-train} dockerfile: dockerfiles/simple.Dockerfile args: BASE_IMAGE: ${LINUX_DISTRO:-ubuntu}:${DISTRO_VERSION:-22.04} - INTERACTIVE_MODE: ${INTERACTIVE_MODE:-include} LOCK_MODE: ${LOCK_MODE:-exclude} CONDA_URL: ${CONDA_URL:-https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh} CONDA_MANAGER: ${CONDA_MANAGER:-mamba} diff --git a/dockerfiles/ngc.Dockerfile b/dockerfiles/ngc.Dockerfile index a3a06df..c1b3dd3 100644 --- a/dockerfiles/ngc.Dockerfile +++ b/dockerfiles/ngc.Dockerfile @@ -31,6 +31,10 @@ ENV PYTHONIOENCODING=UTF-8 ARG PYTHONDONTWRITEBYTECODE=1 ARG PYTHONUNBUFFERED=1 +# The base NGC image sets `SHELL=bash`. Docker cannot unset an `ENV` variable, +# ergo, `SHELL=''` is used for best compatibility with the other services. +ENV SHELL='' + # Install `apt` requirements. # `tzdata` requires noninteractive mode. 
ARG DEBIAN_FRONTEND=noninteractive diff --git a/reqs/hub-apt.requirements.txt b/reqs/hub-apt.requirements.txt index eb7a69e..f0f6ffc 100644 --- a/reqs/hub-apt.requirements.txt +++ b/reqs/hub-apt.requirements.txt @@ -1,4 +1,3 @@ -libjemalloc-dev sudo tmux tzdata diff --git a/reqs/simple-environment.yaml b/reqs/simple-environment.yaml index b3c107b..9de954b 100644 --- a/reqs/simple-environment.yaml +++ b/reqs/simple-environment.yaml @@ -9,13 +9,13 @@ channels: - conda-forge # Always use conda-forge instead. - nvidia # CUDA-related packages are available in the NVIDIA channel. dependencies: # Use conda packages if possible. - - intel::python==3.10 + - python==3.10 - pytorch::pytorch # Only install PyTorch-related packages from the PyTorch channel. - pytorch::torchvision - pytorch::pytorch-cuda==11.8 - jemalloc - - intel::mkl - - intel::numpy # Use Numpy built with the Intel compiler for best performance with MKL. + - mkl + - numpy - pytest - tmux==3.2a - tqdm diff --git a/reqs/train-apt.requirements.txt b/reqs/train-apt.requirements.txt index 8a342a7..f99b476 100644 --- a/reqs/train-apt.requirements.txt +++ b/reqs/train-apt.requirements.txt @@ -1,7 +1,5 @@ -# Example `apt` requirements file. -# `sudo` and `zsh` are required packages. +# Example `apt` requirements file. `sudo` and `zsh` are required packages. at numactl sudo -watchman # For pyre-check only. zsh