From 2a3961695e0ea548f075a5dcc06ec9661309037e Mon Sep 17 00:00:00 2001 From: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:31:49 -0800 Subject: [PATCH] Revert "Run GPU Tests on Rapids Nightly at midnight UTC (#463)" (#489) This reverts commit 6c739e97c09ae6da83a0b67140670c0071611486. --- .github/workflows/gpuci.yml | 35 +++++++++++++---------------------- Dockerfile | 14 ++------------ 2 files changed, 15 insertions(+), 34 deletions(-) diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml index 2b87bea2c..6f14b414b 100644 --- a/.github/workflows/gpuci.yml +++ b/.github/workflows/gpuci.yml @@ -1,8 +1,6 @@ name: "GPU CI/CD" on: - schedule: - - cron: '0 0 * * *' # scheduled trigger for nightly builds, runs at midnight UTC every day push: branches: - main @@ -23,23 +21,18 @@ concurrency: jobs: # First, we build and push a NeMo-Curator container build-container: - # "build-container" job is run if the "gpuci" label is added to the PR / merge to main / scheduled run - if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' || github.event_name == 'schedule' }} + # "build-container" job is run if the "gpuci" label is added to the PR + if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.18.0 with: image-name: nemo_curator_container dockerfile: Dockerfile - # Use different image-label / build-args based on event type - # Given we only use "schedule" event for nightly builds, we use that in our conditions to distinguish nightly vs stable - image-label: ${{ github.event_name == 'schedule' && 'nemo-curator-nightly' || 'nemo-curator' }} + image-label: nemo-curator build-args: | - IMAGE_LABEL=${{ github.event_name == 'schedule' && 'nemo-curator-nightly' || 'nemo-curator' }} + IMAGE_LABEL=nemo-curator REPO_URL=https://github.com/${{ github.repository }}.git CURATOR_COMMIT=${{ github.sha }} - BUILD_TYPE=${{ github.event_name == 'schedule' && 'nightly' || 'stable' }} prune-filter-timerange: 24h - # We don't want to use cache for Nighlty Runs, as we want to build a fresh image each time - use_cache: ${{ github.event_name != 'schedule' }} # Then, we run our PyTests in the container we just built run-gpu-tests: @@ -47,17 +40,15 @@ jobs: # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners # It has 2 A100 GPUs runs-on: self-hosted-azure - # "run-gpu-tests" job is run if the "gpuci" label is added to the PR / merged to main / scheduled run - if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' || github.event_name == 'schedule' }} - env: - CONTAINER_NAME: ${{ github.event_name == 'schedule' && 'nemo-curator-nightly-container' || 'nemo-curator-container' }} + # "run-gpu-tests" job is run if the "gpuci" label is added to the PR + if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} steps: # If something went wrong during the last cleanup, this step ensures any existing container is removed - name: Remove existing container if it exists run: | - if [ "$(docker ps -aq -f name=$CONTAINER_NAME)" ]; then - docker rm -f $CONTAINER_NAME + if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then + docker rm -f nemo-curator-container fi # This runs the container which was pushed by build-container, which we call "nemo-curator-container" @@ -66,20 +57,20 @@ jobs: # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting - name: Run Docker container run: | - docker run --gpus all --name $CONTAINER_NAME -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" + docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" # Expect `whoami` to be "azureuser" # Expect `nvidia-smi` to show our 2 A100 GPUs - name: Check GPUs run: | whoami - docker exec $CONTAINER_NAME nvidia-smi + docker exec nemo-curator-container nvidia-smi # In the virtual environment (called "curator") we created in the container, # list all of our packages. Useful for debugging - name: Verify installations run: | - docker exec $CONTAINER_NAME pip list + docker exec nemo-curator-container pip list # In the virtual environment (called "curator") we created in the container, # run our PyTests marked with `@pytest.mark.gpu` @@ -87,7 +78,7 @@ jobs: # and then the directory where the PyTests are located - name: Run PyTests with GPU mark run: | - docker exec $CONTAINER_NAME pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests + docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests # After running `docker stop`, the container remains in an exited state # It is still present on our system and could be restarted with `docker start` @@ -95,4 +86,4 @@ jobs: - name: Cleanup if: always() run: | - docker stop $CONTAINER_NAME && docker rm $CONTAINER_NAME + docker stop nemo-curator-container && docker rm nemo-curator-container diff --git a/Dockerfile b/Dockerfile index 1aca99e03..b0d1bedc7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,7 +6,6 @@ ARG PYTHON_VER=3.10 ARG IMAGE_LABEL ARG REPO_URL ARG CURATOR_COMMIT -ARG BUILD_TYPE=stable FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} as curator-update # Needed to navigate to and pull the forked repository's changes @@ -31,7 +30,6 @@ WORKDIR /opt # Re-declare ARGs after new FROM to make them available in this stage ARG CUDA_VER -ARG BUILD_TYPE # Install the minimal libcu* libraries needed by NeMo Curator RUN conda create -y --name curator -c nvidia/label/cuda-${CUDA_VER} -c conda-forge \ @@ -52,11 +50,7 @@ RUN \ --mount=type=bind,source=/opt/NeMo-Curator/pyproject.toml,target=/opt/NeMo-Curator/pyproject.toml,from=curator-update \ cd /opt/NeMo-Curator && \ source activate curator && \ - if [ "$BUILD_TYPE" = "nightly" ]; then \ - pip install ".[all_nightly]"; \ - else \ - pip install ".[all]"; \ - fi + pip install ".[all]" COPY --from=curator-update /opt/NeMo-Curator/ /opt/NeMo-Curator/ @@ -64,11 +58,7 @@ COPY --from=curator-update /opt/NeMo-Curator/ /opt/NeMo-Curator/ RUN bash -exu <