Update SMPv2 conda setup script with latest PT2.3.1 TSM2.4.0 #366

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account


Open: wants to merge 1 commit into base branch main.

3.test_cases/17.SM-modelparallelv2/setup_conda_env.sh (64 changes: 32 additions, 32 deletions)

@@ -1,5 +1,11 @@
-# specify which CUDA version you are using
+# specify PyTorch version
+PT_VER=2.3.1
+# specify SMP version
+SMP_VER=2.4.0
+# specify CUDA version
 SMP_CUDA_VER=12.1
+# specify Python version
+PY_VER=3.11
 
 directory="$(pwd)/miniconda3"
 
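The PT_VER, SMP_VER, SMP_CUDA_VER, and PY_VER variables introduced above feed the SMP package spec used further down in this script. As a rough illustration (values taken from this diff, nothing new assumed), the wildcard build string they produce is:

    PT_VER=2.3.1; SMP_VER=2.4.0; SMP_CUDA_VER=12.1; PY_VER=3.11
    echo "pytorch=${PT_VER}=sm_py${PY_VER}_cuda${SMP_CUDA_VER}_*smp_${SMP_VER}*"
    # pytorch=2.3.1=sm_py3.11_cuda12.1_*smp_2.4.0*
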
@@ -16,25 +22,25 @@ source ./miniconda3/bin/activate
 
 export ENV_PATH=./miniconda3/envs/smpv2
 
-conda create -p ${ENV_PATH} python=3.10
+conda create -p ${ENV_PATH} python=${PY_VER}
 
 conda activate ${ENV_PATH}
 
 
-# Install OFI nccl
-conda install "aws-ofi-nccl==1.7.4" packaging --override-channels \
+# Install OFI nccl
+conda install -y "aws-ofi-nccl==1.9.1" packaging --override-channels \
 -c https://aws-ml-conda.s3.us-west-2.amazonaws.com \
 -c pytorch -c numba/label/dev \
 -c nvidia \
 -c conda-forge \
 
-conda install -c conda-forge mkl=2023.1.0
-conda install "requests==2.28.2"
-conda install "filelock==3.9.0"
-conda install "sympy==1.12"
+conda install -y -c conda-forge "mkl<=2024.0" \
+"requests>=2.31.0" \
+"filelock==3.9.0" \
+"sympy==1.12"
 
-# Install SMP V2 pytorch. We will install SMP with pytorch 2.2
-conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_cuda12.1_0" packaging --override-channels \
+# Install SMP V2 pytorch. We will install SMP 2.4.0 with pytorch 2.3.1
+conda install -y pytorch="${PT_VER}=sm_py${PY_VER}_cuda${SMP_CUDA_VER}_*smp_${SMP_VER}*" packaging --override-channels \
 -c https://sagemaker-distributed-model-parallel.s3.us-west-2.amazonaws.com/smp-v2/ \
 -c pytorch -c numba/label/dev \
 -c pytorch-nightly -c nvidia -c conda-forge
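
A quick sanity check after this step could look like the sketch below; it is not part of the PR, and the exact build string reported by conda may differ between releases:

    # List the PyTorch package actually resolved from the SMP channel
    conda list -p ${ENV_PATH} pytorch
    # Confirm the interpreter sees PyTorch 2.3.1
    python -c "import torch; print(torch.__version__)"
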
@@ -43,18 +49,22 @@ conda install pytorch="2.2.0=sm_py3.10_cuda12.1_cudnn8.9.5_nccl_pt_2.2_tsm_2.3_c
 # Install dependencies of the script as below
 
 python -m pip install --no-cache-dir -U \
-"transformers==4.37.1" \
+"transformers==4.40.1" \
 "accelerate==0.28.0" \
 "triton==2.2.0" \
 "SentencePiece==0.1.99" \
-"datasets==2.16.1" \
+"datasets==2.19.0" \
 "expecttest" \
 "parameterized==0.9.0" \
 "protobuf==3.20.3" \
 "pytest-repeat==0.9.1" \
 "pytest==7.4.0" \
 "tensorboard==2.13.0" \
-"tqdm==4.65.0"
+"tqdm==4.65.0" \
+"setuptools==69.5.1" \
+"numpy<2"
+# setuptools==70 has some issues
+# smpv2 is currently not compiled with numpy 2.0 support
 
 pip install megatron-core==0.5.0
 
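To verify the pinned Python dependencies after this step (again a sketch, not part of the change), one could run:

    python - <<'PY'
    import numpy, transformers, datasets
    print(numpy.__version__)        # expected to be < 2
    print(transformers.__version__) # expected 4.40.1
    print(datasets.__version__)     # expected 2.19.0
    PY
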
@@ -64,29 +74,19 @@ MAX_JOBS=64 pip install flash-attn==2.3.3 --no-build-isolation
 
 # Install SMDDP
 
-SMDDP_WHL="smdistributed_dataparallel-2.2.0-cp310-cp310-linux_x86_64.whl" \
-&& wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.2.0/cu121/2024-03-04/${SMDDP_WHL} \
+SMDDP_WHL="smdistributed_dataparallel-2.3.0-cp311-cp311-linux_x86_64.whl" \
+&& wget -q https://smdataparallel.s3.amazonaws.com/binary/pytorch/2.3.0/cu121/2024-05-23/${SMDDP_WHL} \
 && pip install --force ${SMDDP_WHL} \
 && rm ${SMDDP_WHL}
 
 
-if [ $SMP_CUDA_VER == "11.8" ]; then
-# cuDNN installation for TransformerEngine installation for cuda11.8
-tar xf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
-&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
-&& cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
-&& cp ./cudnn-linux-x86_64-8.9.5.30_cuda11-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
-&& rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive.tar.xz \
-&& rm -rf cudnn-linux-x86_64-8.9.5.30_cuda11-archive/
-else
-# cuDNN installation for TransformerEngine installation for cuda12.1
-tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
-&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
-&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
-&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
-&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
-&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
-fi
+# cuDNN installation for TransformerEngine installation for cuda12.1
+tar xf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
+&& rm -rf /usr/local/cuda-$SMP_CUDA_VER/include/cudnn* /usr/local/cuda-$SMP_CUDA_VER/lib/cudnn* \
+&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/include/* /usr/local/cuda-$SMP_CUDA_VER/include/ \
+&& cp ./cudnn-linux-x86_64-8.9.7.29_cuda12-archive/lib/* /usr/local/cuda-$SMP_CUDA_VER/lib/ \
+&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive.tar.xz \
+&& rm -rf cudnn-linux-x86_64-8.9.7.29_cuda12-archive/
 
 # TransformerEngine installation
 export CUDA_HOME=/usr/local/cuda-$SMP_CUDA_VER
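
A minimal smoke test for the SMDDP wheel, flash-attn, and the TransformerEngine build might look like the following; the import paths are taken from the public documentation of those packages and are assumptions here, not part of this PR:

    # Each command failing loudly is the signal of interest
    python -c "import smdistributed.dataparallel.torch.torch_smddp"   # SMDDP backend
    python -c "import flash_attn; print(flash_attn.__version__)"      # expects 2.3.3
    python -c "import transformer_engine.pytorch as te; print('TE OK')"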