Skip to content

Commit

Permalink
Check in Huawei scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
luomai committed May 6, 2019
1 parent e45a23d commit 66742d0
Show file tree
Hide file tree
Showing 6 changed files with 160 additions and 4 deletions.
15 changes: 11 additions & 4 deletions scripts/datasets/imagenet/parse-records.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import math
import numpy as np
import argparse

import tensorflow as tf

Expand Down Expand Up @@ -59,14 +60,20 @@ def _parse(record):
return image, height, width, label, xmin, ymin, xmax, ymax, features['image/class/text']

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-dir', type=str, default='/data/tensorflow/imagenet/train')
parser.add_argument('--output-dir', type=str, default='/data/crossbow/imagenet/train')
args = parser.parse_args()

with tf.Session() as session:
subset = "train"
maxrecordsperfile = 2048
N = 0 # Expect 1251 records in 1 file (for training)
# Number of records per file...
mx = 0
directory = "/data/tensorflow/imagenet/train"
pattern = os.path.join(directory, '%s-*-of-*' % subset)
input_dir = args.input_dir
output_dir = args.output_dir
pattern = os.path.join(input_dir, '%s-*-of-*' % subset)
files = gfile.Glob(pattern)
if not files:
raise ValueError()
Expand Down Expand Up @@ -101,7 +108,7 @@ def _parse(record):
img_checksums = []

filecounter = 1
filename = "crossbow-%s.records.%d" % (subset, filecounter)
filename = "%s/crossbow-%s.records.%d" % (output_dir, subset, filecounter)
f = open(filename, "wb")
# Write number of records as a file header
recordsinfile = 0
Expand Down Expand Up @@ -178,7 +185,7 @@ def _parse(record):
remaining = N - totalrecordswritten
if remaining > 0:
filecounter += 1
filename = "crossbow-%s.records.%d" % (subset, filecounter)
filename = "%s/crossbow-%s.records.%d" % (output_dir, subset, filecounter)
f = open(filename, "wb")
# Write file header
recordsinfile = 0
Expand Down
15 changes: 15 additions & 0 deletions scripts/huawei/download_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/usr/bin/env python

from __future__ import print_function

import moxing as mox
import time
import os

if __name__ == '__main__':
    # Fixed local directory that receives the copied data.
    destination = '/cache/data_dir'
    started_at = time.time()
    # The source location is taken from the DLS_DATA_URL environment
    # variable; a missing variable raises KeyError before any copy starts.
    source_url = os.environ['DLS_DATA_URL']
    print('INFO: Start copying data from the blob storage ' + source_url + ' into SSD under ' + destination)
    mox.file.copy_parallel(source_url, destination)
    # Wall-clock duration of the whole copy, reported in seconds.
    elapsed = time.time() - started_at
    print('INFO: Copying completes! The copy task takes: ' + str(elapsed) + ' seconds')
11 changes: 11 additions & 0 deletions scripts/huawei/runner.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash

CROSSBOW_HOME=/home/work/user-job-dir/Crossbow

# Create /cache/train_dir when it does not exist yet.
if [ ! -d "/cache/train_dir" ]; then
    mkdir /cache/train_dir
fi

# Step 1: run the data download script.
python "$CROSSBOW_HOME/scripts/huawei/download_data.py"

# Step 2: run the ImageNet preparation script with /cache/data_dir and
# /cache/train_dir as its two positional arguments.
bash "$CROSSBOW_HOME/scripts/datasets/imagenet/prepare-imagenet.sh" /cache/data_dir /cache/train_dir

# Step 3: run the upload script.
python "$CROSSBOW_HOME/scripts/huawei/upload_data.py"
21 changes: 21 additions & 0 deletions scripts/huawei/train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

CROSSBOW_HOME=/crossbow
# Directory holding the job-specific scripts and metadata files.
JOB_SCRIPTS=/home/work/user-job-dir/Crossbow-scripts

# Pull the latest sources and rebuild Crossbow.
# The function runs in the current shell, so its directory changes persist
# for the rest of the script. Steps are chained with &&: the first failing
# step skips the remaining build steps, but the script carries on afterwards.
refresh_and_build() {
    cd $CROSSBOW_HOME &&
        git pull &&
        mvn package &&
        cd clib-multigpu &&
        ./genmakefile.sh &&
        make -j $(nproc) &&
        cd ../ &&
        ./scripts/build.sh
}
refresh_and_build

python "$CROSSBOW_HOME/scripts/huawei/download_data.py"

# Move the test and train metadata files into the Crossbow data directory.
for split in test train; do
    mv "$JOB_SCRIPTS/imagenet-$split.metadata" "$CROSSBOW_HOME/data/imagenet/imagenet-$split.metadata"
done

bash "$JOB_SCRIPTS/resnet-50.sh"

python "$CROSSBOW_HOME/scripts/huawei/upload_data.py"
12 changes: 12 additions & 0 deletions scripts/huawei/upload_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python

from __future__ import print_function

import moxing as mox
import os

if __name__ == '__main__':
    # The destination is taken from the DLS_TRAIN_URL environment variable;
    # a missing variable raises KeyError before anything is copied.
    destination_url = os.environ['DLS_TRAIN_URL']
    # Fixed local directory whose contents are copied to the destination.
    local_dir = '/cache/train_dir'
    print('INFO: Copy trained model to ' + destination_url)
    mox.file.copy_parallel(local_dir, destination_url)
90 changes: 90 additions & 0 deletions tools/dockerfiles/dockerfiles/Dockerfile.huawei
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Image that builds Crossbow on top of a Huawei ModelArts custom GPU base image.
# ModelArts example: https://github.com/huawei-clouds/modelarts-example/blob/master/CustomImage
FROM swr.cn-north-1.myhuaweicloud.com/eiwizard/custom-gpu-cuda9-inner-moxing-cp36:1.1 as base

# The pip source has been pre-configured to an internal source. Roll back to public sources
# by deleting the pip configuration file.
RUN rm $HOME/.pip/pip.conf

# Fix the source lists: replace the cmc-cd-mirror.rnd.huawei.com host with security.ubuntu.com.
RUN sed -i 's/cmc-cd-mirror.rnd.huawei.com/security.ubuntu.com/g' /etc/apt/sources.list

# Replace the standard Ubuntu sources with Aliyun mirrors, intended for builds in mainland China.
# NOTE: these substitutions run unconditionally; drop this step when building elsewhere.
RUN sed -i s/archive.ubuntu.com/mirrors.aliyun.com/g /etc/apt/sources.list \
&& sed -i s/security.ubuntu.com/mirrors.aliyun.com/g /etc/apt/sources.list

# Add the NVIDIA package repo and fetch key
# Reference: https://gitlab.com/nvidia/cuda/blob/ubuntu16.04/9.0/base/Dockerfile#L4
# The exported key is checked against a pinned SHA-256 sum before the CUDA and
# machine-learning repositories are written to /etc/apt/sources.list.d.
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
rm -rf /var/lib/apt/lists/* && \
NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +5 > cudasign.pub && \
echo "$NVIDIA_GPGKEY_SUM cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list

# Install the CUDA 9.0 libraries, version-pinned cuDNN and NCCL packages, and the
# build toolchain (JDK 8, Maven, Boost, CMake, nasm, git, ...). The apt package
# lists are removed at the end of the same layer.
RUN apt update && apt install -y --no-install-recommends \
apt-utils \
build-essential \
cuda9.0 \
cuda-cublas-9-0 \
cuda-cufft-9-0 \
cuda-curand-9-0 \
cuda-cusolver-9-0 \
cuda-cusparse-9-0 \
libcudnn7=7.2.1.38-1+cuda9.0 \
libcudnn7-dev=7.2.1.38-1+cuda9.0 \
libnccl2=2.2.13-1+cuda9.0 \
libnccl-dev=2.2.13-1+cuda9.0 \
cuda-command-line-tools-9-0 \
libfreetype6-dev \
libhdf5-serial-dev \
libpng12-dev \
libzmq3-dev \
pkg-config \
software-properties-common \
unzip \
git \
wget \
openjdk-8-jdk \
maven \
libboost-all-dev \
graphviz \
cmake \
nasm \
&& rm -rf /var/lib/apt/lists/*

ENV CUDA_HOME /usr/local/cuda

# OpenBLAS (TODO: install using apt install)
# Built from a plain clone: no tag or commit is pinned, so the result depends on
# the repository state at build time.
RUN git clone --progress https://github.com/xianyi/OpenBLAS.git openblas \
&& cd openblas \
&& make -j $(nproc) \
&& make install
# NOTE(review): /opt/OpenBLAS is presumably where `make install` places the library - confirm.
ENV BLAS_HOME /opt/OpenBLAS
ENV LD_LIBRARY_PATH $BLAS_HOME/lib:$LD_LIBRARY_PATH

# libjpeg-turbo (TODO: install using apt install)
# Built in place inside the clone; there is no install step, so JPEG_HOME points
# at the checkout itself.
# NOTE(review): JPEG_HOME=/libjpeg-turbo assumes the clone runs with / as the working directory - confirm.
RUN git clone --progress https://github.com/libjpeg-turbo/libjpeg-turbo.git \
&& cd libjpeg-turbo \
&& cmake -G"Unix Makefiles" && make -j $(nproc)
ENV JPEG_HOME /libjpeg-turbo
# Both $JPEG_HOME/lib and $JPEG_HOME itself are prepended to the library search path.
ENV LD_LIBRARY_PATH $JPEG_HOME/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH $JPEG_HOME:$LD_LIBRARY_PATH

# Crossbow
# Copy the build context into /crossbow, then build the Java package, the
# clib-multigpu native library, and run scripts/build.sh.
# NOTE(review): `cd crossbow` is a relative path, so it relies on the working directory being / - confirm.
ADD . /crossbow
ENV CROSSBOW_HOME /crossbow
RUN cd crossbow \
&& mvn package \
&& cd clib-multigpu \
&& ./genmakefile.sh \
&& make -j $(nproc) \
&& cd ../ \
&& ./scripts/build.sh

# Install tensorflow-gpu 1.12.0 in the conda environment (pip has been redirected to conda pip)
# The active line installs from the Tsinghua PyPI mirror; the commented-out line
# below is the same install from the default package index.
RUN pip install -i https://pypi.tuna.tsinghua.edu.cn/simple tensorflow-gpu==1.12.0 # Run this if in the mainland China
# RUN pip install tensorflow-gpu==1.12.0

WORKDIR /

0 comments on commit 66742d0

Please sign in to comment.