Skip to content

Commit fa83a68

Browse files
committed
Update SMPv2 conda setup script with latest PT2.3.1 TSM2.4.0
1 parent 94bd6c9 commit fa83a68

File tree

1 file changed

+32
-32
lines changed

1 file changed

+32
-32
lines changed

3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
1-
# specify which CUDA version you are using
1+
# specify PyTorch version
2+
PT_VER=2.3.1
3+
# specify SMP version
4+
SMP_VER=2.4.0
5+
# specify CUDA version
26
SMP_CUDA_VER=12.1
7+
# specify Python version
8+
PY_VER=3.11
39

410
directory="$(pwd)/miniconda3"
511

@@ -16,25 +22,25 @@ source ./miniconda3/bin/activate
1622

1723
export ENV_PATH=./miniconda3/envs/smpv2
1824

19-
conda create -p ${ENV_PATH} python=3.10
25+
conda create -y -p "${ENV_PATH}" python="${PY_VER}"  # -y: no confirmation prompt, consistent with the conda install calls in this script
2026

2127
conda activate ${ENV_PATH}
2228

2329

24-
# Install OFI nccl
25-
conda install "aws-ofi-nccl==1.7.4" packaging --override-channels \
30+
# Install OFI nccl
31+
conda install -y "aws-ofi-nccl==1.9.1" packaging --override-channels \
2632
-c https://aws-ml-conda.s3.us-west-2.amazonaws.com \
2733
-c pytorch -c numba/label/dev \
2834
-c nvidia \
2935
-c conda-forge \
3036

31-
conda install -c conda-forge mkl=2023.1.0
32-
conda install "requests==2.28.2"
33-
conda install "filelock==3.9.0"
34-
conda install "sympy==1.12"
37+
conda install -y -c conda-forge "mkl<=2024.0" \
38+
"requests>=2.31.0" \
39+
"filelock==3.9.0" \
40+
"sympy==1.12"
3541

36-
# Install SMP V2 pytorch. We will install SMP with pytorch 2.2
37-
conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_cuda12.1_0" packaging --override-channels \
42+
# Install the SMP v2 build of PyTorch: SMP ${SMP_VER} (2.4.0) with PyTorch ${PT_VER} (2.3.1), as set at the top of this script
43+
conda install -y pytorch="${PT_VER}=sm_py${PY_VER}_cuda${SMP_CUDA_VER}_*smp_${SMP_VER}*" packaging --override-channels \
3844
-c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \
3945
-c pytorch -c numba/label/dev \
4046
-c pytorch-nightly -c nvidia -c conda-forge
@@ -43,18 +49,22 @@ conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_c
4349
# Install dependencies of the script as below
4450

4551
python -m pip install --no-cache-dir -U \
46-
"transformers==4.37.1" \
52+
"transformers==4.40.1" \
4753
"accelerate==0.28.0" \
4854
"triton==2.2.0" \
4955
"SentencePiece==0.1.99" \
50-
"datasets==2.16.1" \
56+
"datasets==2.19.0" \
5157
"expecttest" \
5258
"parameterized==0.9.0" \
5359
"protobuf==3.20.3" \
5460
"pytest-repeat==0.9.1" \
5561
"pytest==7.4.0" \
5662
"tensorboard==2.13.0" \
57-
"tqdm==4.65.0"
63+
"tqdm==4.65.0" \
64+
"setuptools==69.5.1" \
65+
"numpy<2"
66+
# Pin notes (kept after the command: a comment between backslash-continued lines would end the command early):
67+
# setuptools==70 has some issues; smpv2 is currently not compiled with numpy 2.0 support
5868

5969
pip install megatron-core==0.5.0
6070

@@ -64,29 +74,19 @@ MAX_JOBS=64 pip install flash-attn==2.3.3 --no-build-isolation
6474

6575
# Install SMDDP
6676

67-
SMDDP_WHL="smdistributed_dataparallel-2.2.0-cp310-cp310-linux_x86_64.whl" \
68-
&& wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.2.0/cu121/2024-03-04/${SMDDP_WHL} \
77+
SMDDP_WHL="smdistributed_dataparallel-2.3.0-cp311-cp311-linux_x86_64.whl" \
78+
&& wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.3.0/cu121/2024-05-23/${SMDDP_WHL} \
6979
&& pip install --force ${SMDDP_WHL} \
7080
&& rm ${SMDDP_WHL}
7181

7282

73-
if [ $SMP_CUDA_VER == "11.8" ]; then
74-
# cuDNN installation for TransformerEngine installation for cuda11.8
75-
tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
76-
&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
77-
&& cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
78-
&& cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
79-
&& rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
80-
&& rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive/
81-
else
82-
# cuDNN installation for TransformerEngine installation for cuda12.1
83-
tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
84-
&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
85-
&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
86-
&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
87-
&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
88-
&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
89-
fi
83+
# Install cuDNN 8.9.7.29 for CUDA 12.1, used by the TransformerEngine installation below
84+
tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
85+
&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
86+
&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
87+
&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
88+
&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
89+
&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
9090

9191
# TransformerEngine installation
9292
export CUDA_HOME=/usr/local/cuda-$SMP_CUDA_VER

0 commit comments

Comments
 (0)