diff --git a/scripts/datasets/imagenet/parse-records.py b/scripts/datasets/imagenet/parse-records.py index 4b0321c..588dbbd 100644 --- a/scripts/datasets/imagenet/parse-records.py +++ b/scripts/datasets/imagenet/parse-records.py @@ -3,6 +3,7 @@ import os import math import numpy as np +import argparse import tensorflow as tf @@ -59,14 +60,20 @@ def _parse(record): return image, height, width, label, xmin, ymin, xmax, ymax, features['image/class/text'] if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--input-dir', type=str, default='/data/tensorflow/imagenet/train') + parser.add_argument('--output-dir', type=str, default='/data/crossbow/imagenet/train') + args = parser.parse_args() + with tf.Session() as session: subset = "train" maxrecordsperfile = 2048 N = 0 # Expect 1251 records in 1 file (for training) # Number of records per file... mx = 0 - directory = "/data/tensorflow/imagenet/train" - pattern = os.path.join(directory, '%s-*-of-*' % subset) + input_dir = args.input_dir + output_dir = args.output_dir + pattern = os.path.join(input_dir, '%s-*-of-*' % subset) files = gfile.Glob(pattern) if not files: raise ValueError() @@ -101,7 +108,7 @@ def _parse(record): img_checksums = [] filecounter = 1 - filename = "crossbow-%s.records.%d" % (subset, filecounter) + filename = "%s/crossbow-%s.records.%d" % (output_dir, subset, filecounter) f = open(filename, "wb") # Write number of records as a file header recordsinfile = 0 @@ -178,7 +185,7 @@ def _parse(record): remaining = N - totalrecordswritten if remaining > 0: filecounter += 1 - filename = "crossbow-%s.records.%d" % (subset, filecounter) + filename = "%s/crossbow-%s.records.%d" % (output_dir, subset, filecounter) f = open(filename, "wb") # Write file header recordsinfile = 0 diff --git a/scripts/huawei/download_data.py b/scripts/huawei/download_data.py new file mode 100644 index 0000000..61d0c05 --- /dev/null +++ b/scripts/huawei/download_data.py @@ -0,0 +1,15 @@ +#!/usr/bin/env 
python + +from __future__ import print_function + +import moxing as mox +import time +import os + +if __name__ == '__main__': + data_dir = '/cache/data_dir' + start = time.time() + data_url = os.environ['DLS_DATA_URL'] + print('INFO: Start copying data from the blob storage ' + data_url + ' into SSD under ' + data_dir) + mox.file.copy_parallel(data_url, data_dir) + print('INFO: Copying completes! The copy task takes: ' + str(time.time() - start) + ' seconds') \ No newline at end of file diff --git a/scripts/huawei/runner.sh b/scripts/huawei/runner.sh new file mode 100644 index 0000000..ea3219e --- /dev/null +++ b/scripts/huawei/runner.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +CROSSBOW_HOME=/home/work/user-job-dir/Crossbow + +[ ! -d "/cache/train_dir" ] && mkdir /cache/train_dir + +python $CROSSBOW_HOME/scripts/huawei/download_data.py + +bash $CROSSBOW_HOME/scripts/datasets/imagenet/prepare-imagenet.sh /cache/data_dir /cache/train_dir + +python $CROSSBOW_HOME/scripts/huawei/upload_data.py \ No newline at end of file diff --git a/scripts/huawei/train.sh b/scripts/huawei/train.sh new file mode 100644 index 0000000..e5683e0 --- /dev/null +++ b/scripts/huawei/train.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +CROSSBOW_HOME=/crossbow + +cd $CROSSBOW_HOME \ + && git pull \ + && mvn package \ + && cd clib-multigpu \ + && ./genmakefile.sh \ + && make -j $(nproc) \ + && cd ../ \ + && ./scripts/build.sh + +python $CROSSBOW_HOME/scripts/huawei/download_data.py + +mv /home/work/user-job-dir/Crossbow-scripts/imagenet-test.metadata $CROSSBOW_HOME/data/imagenet/imagenet-test.metadata +mv /home/work/user-job-dir/Crossbow-scripts/imagenet-train.metadata $CROSSBOW_HOME/data/imagenet/imagenet-train.metadata + +bash /home/work/user-job-dir/Crossbow-scripts/resnet-50.sh + +python $CROSSBOW_HOME/scripts/huawei/upload_data.py \ No newline at end of file diff --git a/scripts/huawei/upload_data.py b/scripts/huawei/upload_data.py new file mode 100644 index 0000000..fa5fac9 --- /dev/null +++ 
b/scripts/huawei/upload_data.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python + +from __future__ import print_function + +import moxing as mox +import os + +if __name__ == '__main__': + train_dir = '/cache/train_dir' + train_url = os.environ['DLS_TRAIN_URL'] + print('INFO: Copy trained model to ' + train_url) + mox.file.copy_parallel(train_dir, train_url) \ No newline at end of file diff --git a/tools/dockerfiles/dockerfiles/Dockerfile.huawei b/tools/dockerfiles/dockerfiles/Dockerfile.huawei new file mode 100644 index 0000000..44a87e0 --- /dev/null +++ b/tools/dockerfiles/dockerfiles/Dockerfile.huawei @@ -0,0 +1,90 @@ +# ModelArts example: https://github.com/huawei-clouds/modelarts-example/blob/master/CustomImage +FROM swr.cn-north-1.myhuaweicloud.com/eiwizard/custom-gpu-cuda9-inner-moxing-cp36:1.1 as base + +# The pip source has been pre-configured to an internal source. Roll back to public sources. +RUN rm $HOME/.pip/pip.conf + +# Fix the source lists +RUN sed -i 's/cmc-cd-mirror.rnd.huawei.com/security.ubuntu.com/g' /etc/apt/sources.list + +# Replace the standard ubuntu source with Aliyun sources if building in mainland China +RUN sed -i s/archive.ubuntu.com/mirrors.aliyun.com/g /etc/apt/sources.list \ + && sed -i s/security.ubuntu.com/mirrors.aliyun.com/g /etc/apt/sources.list + +# Add the NVIDIA package repo and fetch key +# Reference: https://gitlab.com/nvidia/cuda/blob/ubuntu16.04/9.0/base/Dockerfile#L4 +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \ + rm -rf /var/lib/apt/lists/* && \ + NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \ + NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \ + apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \ + echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | 
sha256sum -c --strict - && rm cudasign.pub && \ + echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \ + echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list + +RUN apt update && apt install -y --no-install-recommends \ + apt-utils \ + build-essential \ + cuda9.0 \ + cuda-cublas-9-0 \ + cuda-cufft-9-0 \ + cuda-curand-9-0 \ + cuda-cusolver-9-0 \ + cuda-cusparse-9-0 \ + libcudnn7=7.2.1.38-1+cuda9.0 \ + libcudnn7-dev=7.2.1.38-1+cuda9.0 \ + libnccl2=2.2.13-1+cuda9.0 \ + libnccl-dev=2.2.13-1+cuda9.0 \ + cuda-command-line-tools-9-0 \ + libfreetype6-dev \ + libhdf5-serial-dev \ + libpng12-dev \ + libzmq3-dev \ + pkg-config \ + software-properties-common \ + unzip \ + git \ + wget \ + openjdk-8-jdk \ + maven \ + libboost-all-dev \ + graphviz \ + cmake \ + nasm \ + && rm -rf /var/lib/apt/lists/* + +ENV CUDA_HOME /usr/local/cuda + +# OpenBLAS (TODO: install using apt install) +RUN git clone --progress https://github.com/xianyi/OpenBLAS.git openblas \ + && cd openblas \ + && make -j $(nproc) \ + && make install +ENV BLAS_HOME /opt/OpenBLAS +ENV LD_LIBRARY_PATH $BLAS_HOME/lib:$LD_LIBRARY_PATH + +# libjpeg-turbo (TODO: install using apt install) +RUN git clone --progress https://github.com/libjpeg-turbo/libjpeg-turbo.git \ + && cd libjpeg-turbo \ + && cmake -G"Unix Makefiles" && make -j $(nproc) +ENV JPEG_HOME /libjpeg-turbo +ENV LD_LIBRARY_PATH $JPEG_HOME/lib:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH $JPEG_HOME:$LD_LIBRARY_PATH + +# Crossbow +ADD . 
/crossbow +ENV CROSSBOW_HOME /crossbow +RUN cd crossbow \ + && mvn package \ + && cd clib-multigpu \ + && ./genmakefile.sh \ + && make -j $(nproc) \ + && cd ../ \ + && ./scripts/build.sh + +# Install tensorflow-gpu 1.12.0 in the conda environment (pip has been redirected to conda pip) +RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tensorflow-gpu==1.12.0 # Run this if building in mainland China +# RUN pip install tensorflow-gpu==1.12.0 + +WORKDIR / \ No newline at end of file