- # specify which CUDA version you are using
+ # specify PyTorch version
+ PT_VER=2.3.1
+ # specify SMP version
+ SMP_VER=2.4.0
+ # specify CUDA version
SMP_CUDA_VER=12.1
+ # specify Python version
+ PY_VER=3.11

directory="$(pwd)/miniconda3"

@@ -16,25 +22,25 @@ source ./miniconda3/bin/activate

export ENV_PATH=./miniconda3/envs/smpv2

- conda create -p ${ENV_PATH} python=3.10
+ conda create -p ${ENV_PATH} python=${PY_VER}

conda activate ${ENV_PATH}
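
# Optional sanity check (editor's addition, not part of the original script): the freshly
# activated env should report the Python version requested via PY_VER above.
python --version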

- # Install OFI nccl
- conda install "aws-ofi-nccl==1.7.4" packaging --override-channels \
+ # Install OFI nccl
+ conda install -y "aws-ofi-nccl==1.9.1" packaging --override-channels \
-c https://aws-ml-conda.s3.us-west-2.amazonaws.com \
-c pytorch -c numba/label/dev \
-c nvidia \
-c conda-forge \

- conda install -c conda-forge mkl=2023.1.0
- conda install "requests==2.28.2"
- conda install "filelock==3.9.0"
- conda install "sympy==1.12"
+ conda install -y -c conda-forge "mkl<=2024.0" \
+ "requests>=2.31.0" \
+ "filelock==3.9.0" \
+ "sympy==1.12"

- # Install SMP V2 pytorch. We will install SMP with pytorch 2.2
- conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_cuda12.1_0" packaging --override-channels \
+ # Install SMP V2 pytorch. We will install SMP 2.4.0 with pytorch 2.3.1
+ conda install -y pytorch="${PT_VER}=sm_py${PY_VER}_cuda${SMP_CUDA_VER}_*smp_${SMP_VER}*" packaging --override-channels \
-c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \
-c pytorch -c numba/label/dev \
-c pytorch-nightly -c nvidia -c conda-forge
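
# Optional sanity check (editor's addition, not part of the original script). The
# torch.sagemaker module name is assumed from the SMP v2 documentation; it is not
# referenced anywhere in this script.
python -c "import torch; import torch.sagemaker; print(torch.__version__)"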
@@ -43,18 +49,22 @@ conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_c
# Install dependencies of the script as below

python -m pip install --no-cache-dir -U \
- "transformers==4.37.1" \
+ "transformers==4.40.1" \
"accelerate==0.28.0" \
"triton==2.2.0" \
"SentencePiece==0.1.99" \
- "datasets==2.16.1" \
+ "datasets==2.19.0" \
"expecttest" \
"parameterized==0.9.0" \
"protobuf==3.20.3" \
"pytest-repeat==0.9.1" \
"pytest==7.4.0" \
"tensorboard==2.13.0" \
- "tqdm==4.65.0"
+ "tqdm==4.65.0" \
+ "setuptools==69.5.1" \
+ "numpy<2"
+ # setuptools==70 has some issues, hence the 69.5.1 pin above
+ # smpv2 is currently not compiled with numpy 2.0 support, hence numpy<2

pip install megatron-core==0.5.0
@@ -64,29 +74,19 @@ MAX_JOBS=64 pip install flash-attn==2.3.3 --no-build-isolation

# Install SMDDP

- SMDDP_WHL="smdistributed_dataparallel-2.2.0-cp310-cp310-linux_x86_64.whl" \
- && wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.2.0/cu121/2024-03-04/${SMDDP_WHL} \
+ SMDDP_WHL="smdistributed_dataparallel-2.3.0-cp311-cp311-linux_x86_64.whl" \
+ && wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.3.0/cu121/2024-05-23/${SMDDP_WHL} \
&& pip install --force ${SMDDP_WHL} \
&& rm ${SMDDP_WHL}
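
# Optional sanity check (editor's addition, not part of the original script). The import
# path below is assumed from the SMDDP documentation; it is not referenced in this script.
python -c "import smdistributed.dataparallel.torch.torch_smddp"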

- if [ $SMP_CUDA_VER == "11.8" ]; then
- # cuDNN installation for TransformerEngine installation for cuda11.8
- tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
- && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
- && cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
- && cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
- && rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
- && rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive/
- else
- # cuDNN installation for TransformerEngine installation for cuda12.1
- tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
- && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
- && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
- && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
- && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
- && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
- fi
+ # cuDNN installation for TransformerEngine installation for cuda12.1
+ tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
+ && rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
+ && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
+ && cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
+ && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
+ && rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
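
# Optional sanity check (editor's addition, not part of the original script): confirm the
# cuDNN headers landed in the CUDA toolkit directory used by the TransformerEngine build.
ls /usr/local/cuda-$SMP_CUDA_VER/include/cudnn.h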

# TransformerEngine installation
export CUDA_HOME=/usr/local/cuda-$SMP_CUDA_VER