From f3ce8d5007cbff9b6d33f381cb10c712dd4ccd8f Mon Sep 17 00:00:00 2001 From: Praateek Date: Tue, 17 Dec 2024 11:14:12 -0800 Subject: [PATCH 1/5] fc fancy Signed-off-by: Praateek --- .github/workflows/gpuci.yml | 104 +++++++++++++++++------------------- Dockerfile | 21 ++++++-- 2 files changed, 66 insertions(+), 59 deletions(-) diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml index a48e79ef5..520b70af3 100644 --- a/.github/workflows/gpuci.yml +++ b/.github/workflows/gpuci.yml @@ -2,88 +2,84 @@ name: "GPU CI/CD" on: push: - branches: - - main + branches: [main] pull_request: branches: - # We can run gpuCI on any PR targeting these branches - 'main' - '[rv][0-9].[0-9].[0-9]' - '[rv][0-9].[0-9].[0-9]rc[0-9]' - # PR has to be labeled with "gpuCI" label - # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI - types: [ labeled ] + types: [labeled] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} cancel-in-progress: true +# Reusable job templates jobs: - # First, we build and push a NeMo-Curator container build-container: - # "build-container" job is run if the "gpuci" label is added to the PR if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} + strategy: + matrix: + include: + - type: stable + image-suffix: "" + - type: nightly + image-suffix: "_nightly" uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_build_container.yml@v0.11.0 with: - image-name: nemo_curator_container + image-name: nemo_curator_container${{ matrix.image-suffix }} dockerfile: Dockerfile - image-label: nemo-curator + image-label: nemo-curator${{ matrix.image-suffix }} build-args: | - IMAGE_LABEL=nemo-curator + IMAGE_LABEL=nemo-curator${{ matrix.image-suffix }} REPO_URL=https://github.com/${{ github.repository }}.git CURATOR_COMMIT=${{ github.sha }} + BUILD_TYPE=${{ matrix.type }} prune-filter-timerange: 24h - # Then, we run our PyTests in the container we just 
built run-gpu-tests: needs: build-container - # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners - # It has 2 A100 GPUs runs-on: self-hosted-azure - # "run-gpu-tests" job is run if the "gpuci" label is added to the PR if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} + strategy: + matrix: + include: + - type: stable + image-suffix: "" + - type: nightly + image-suffix: "_nightly" - steps: - # If something went wrong during the last cleanup, this step ensures any existing container is removed - - name: Remove existing container if it exists - run: | - if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then - docker rm -f nemo-curator-container - fi + env: + CONTAINER_NAME: nemo-curator-container${{ matrix.image-suffix }} + IMAGE_NAME: nemoci.azurecr.io/nemo_curator_container${{ matrix.image-suffix }}:${{ github.run_id }} - # This runs the container which was pushed by build-container, which we call "nemo-curator-container" - # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container - # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with - # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting - - name: Run Docker container - run: | - docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity" + steps: + - name: Cleanup existing container + run: docker rm -f ${{ env.CONTAINER_NAME }} || true - # Expect `whoami` to be "azureuser" - # Expect `nvidia-smi` to show our 2 A100 GPUs - - name: Check GPUs - run: | - whoami - docker exec nemo-curator-container nvidia-smi + - name: Run container + run: | + docker run --gpus all \ + --name ${{ env.CONTAINER_NAME }} \ + -d ${{ env.IMAGE_NAME }} \ + bash -c "sleep infinity" - # In the virtual environment (called "curator") we created in the container, - # 
list all of our packages. Useful for debugging - - name: Verify installations - run: | - docker exec nemo-curator-container pip list + - name: Verify environment + run: | + echo "Checking system user:" + docker exec ${{ env.CONTAINER_NAME }} whoami + echo "Checking GPU availability:" + docker exec ${{ env.CONTAINER_NAME }} nvidia-smi + echo "Checking installed packages:" + docker exec ${{ env.CONTAINER_NAME }} pip list - # In the virtual environment (called "curator") we created in the container, - # run our PyTests marked with `@pytest.mark.gpu` - # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), - # and then the directory where the PyTests are located - - name: Run PyTests with GPU mark - run: | - docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests + - name: Run GPU tests + run: | + docker exec ${{ env.CONTAINER_NAME }} \ + pytest -m gpu \ + --rootdir /opt/NeMo-Curator \ + /opt/NeMo-Curator/tests - # After running `docker stop`, the container remains in an exited state - # It is still present on our system and could be restarted with `docker start` - # Thus, we use `docker rm` to permanently removed it from the system - - name: Cleanup - if: always() - run: | - docker stop nemo-curator-container && docker rm nemo-curator-container + - name: Cleanup + if: always() + run: docker rm -f ${{ env.CONTAINER_NAME }} || true diff --git a/Dockerfile b/Dockerfile index 37bcb35d4..71e5dc729 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,7 @@ ARG PYTHON_VER=3.10 ARG IMAGE_LABEL ARG REPO_URL ARG CURATOR_COMMIT +ARG BUILD_TYPE=stable FROM rapidsai/ci-conda:cuda${CUDA_VER}-${LINUX_VER}-py${PYTHON_VER} as curator-update # Needed to navigate to and pull the forked repository's changes @@ -23,14 +24,16 @@ RUN bash -exu < Date: Tue, 17 Dec 2024 11:43:25 -0800 Subject: [PATCH 2/5] sc Signed-off-by: Praateek --- .github/workflows/gpuci.yml | 64 
++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml index 520b70af3..13167b5e9 100644 --- a/.github/workflows/gpuci.yml +++ b/.github/workflows/gpuci.yml @@ -5,9 +5,12 @@ on: branches: [main] pull_request: branches: + # We can run gpuCI on any PR targeting these branches - 'main' - '[rv][0-9].[0-9].[0-9]' - '[rv][0-9].[0-9].[0-9]rc[0-9]' + # PR has to be labeled with "gpuCI" label + # If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI types: [labeled] concurrency: @@ -16,7 +19,9 @@ concurrency: # Reusable job templates jobs: + # First, we build and push a NeMo-Curator container build-container: + # "build-container" job is run if the "gpuci" label is added to the PR if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} strategy: matrix: @@ -36,10 +41,13 @@ jobs: CURATOR_COMMIT=${{ github.sha }} BUILD_TYPE=${{ matrix.type }} prune-filter-timerange: 24h - + # Then, we run our PyTests in the container we just built run-gpu-tests: needs: build-container + # This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners + # It has 2 A100 GPUs runs-on: self-hosted-azure + # "run-gpu-tests" job is run if the "gpuci" label is added to the PR if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }} strategy: matrix: @@ -54,32 +62,42 @@ jobs: IMAGE_NAME: nemoci.azurecr.io/nemo_curator_container${{ matrix.image-suffix }}:${{ github.run_id }} steps: - - name: Cleanup existing container - run: docker rm -f ${{ env.CONTAINER_NAME }} || true - - - name: Run container - run: | - docker run --gpus all \ - --name ${{ env.CONTAINER_NAME }} \ - -d ${{ env.IMAGE_NAME }} \ - bash -c "sleep infinity" + # If something went wrong during the last cleanup, this step ensures any existing container is removed + - name: Remove existing container if it exists + run: | + if [ 
"$(docker ps -aq -f name=${{ env.CONTAINER_NAME }})" ]; then + docker rm -f ${{ env.CONTAINER_NAME }} + fi + # This runs the container which was pushed by build-container, which we call "nemo-curator-container" + # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container + # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with + # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting + - name: Run Docker container + run: | + docker run --gpus all --name ${{ env.CONTAINER_NAME }} \ + -d ${{ env.IMAGE_NAME }} \ + bash -c "sleep infinity" - - name: Verify environment - run: | + # In the virtual environment (called "curator") we created in the container, + # list all of our packages. Useful for debugging + - name: Verify installations + run: | echo "Checking system user:" docker exec ${{ env.CONTAINER_NAME }} whoami echo "Checking GPU availability:" docker exec ${{ env.CONTAINER_NAME }} nvidia-smi echo "Checking installed packages:" docker exec ${{ env.CONTAINER_NAME }} pip list - - - name: Run GPU tests - run: | - docker exec ${{ env.CONTAINER_NAME }} \ - pytest -m gpu \ - --rootdir /opt/NeMo-Curator \ - /opt/NeMo-Curator/tests - - - name: Cleanup - if: always() - run: docker rm -f ${{ env.CONTAINER_NAME }} || true + # In the virtual environment (called "curator") we created in the container, + # run our PyTests marked with `@pytest.mark.gpu` + # We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository), + # and then the directory where the PyTests are located + - name: Run PyTests with GPU mark + run: | + docker exec ${{ env.CONTAINER_NAME }} pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests + # After running `docker stop`, the container remains in an exited state + # It is still present on our system and could be restarted with `docker start` + # Thus, we use `docker rm` to 
permanently removed it from the system + - name: Cleanup + if: always() + run: docker rm -f ${{ env.CONTAINER_NAME }} || true From 28c84bfefc2a5c86c87eded9ddde28700c86d1c4 Mon Sep 17 00:00:00 2001 From: Praateek Date: Tue, 17 Dec 2024 11:46:44 -0800 Subject: [PATCH 3/5] tc Signed-off-by: Praateek --- .github/workflows/gpuci.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gpuci.yml b/.github/workflows/gpuci.yml index 13167b5e9..92e006264 100644 --- a/.github/workflows/gpuci.yml +++ b/.github/workflows/gpuci.yml @@ -68,6 +68,7 @@ jobs: if [ "$(docker ps -aq -f name=${{ env.CONTAINER_NAME }})" ]; then docker rm -f ${{ env.CONTAINER_NAME }} fi + # This runs the container which was pushed by build-container, which we call "nemo-curator-container" # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with @@ -80,9 +81,12 @@ jobs: # In the virtual environment (called "curator") we created in the container, # list all of our packages. 
Useful for debugging - - name: Verify installations + # Expect `whoami` to be "azureuser" + # Expect `nvidia-smi` to show our 2 A100 GPUs + - name: Check GPUs + Verify installations run: | echo "Checking system user:" + whoami docker exec ${{ env.CONTAINER_NAME }} whoami echo "Checking GPU availability:" docker exec ${{ env.CONTAINER_NAME }} nvidia-smi @@ -100,4 +104,4 @@ jobs: # Thus, we use `docker rm` to permanently removed it from the system - name: Cleanup if: always() - run: docker rm -f ${{ env.CONTAINER_NAME }} || true + run: docker stop ${{ env.CONTAINER_NAME }} && docker rm ${{ env.CONTAINER_NAME }} From 077ba5c4a22ecc3c8cc2e8dae2565ae2ae22b82e Mon Sep 17 00:00:00 2001 From: Praateek Date: Tue, 17 Dec 2024 11:51:59 -0800 Subject: [PATCH 4/5] testing if fasttext 0.9.3 works, if so we should merge 434 -> 435 -> revert this change -> 436 Signed-off-by: Praateek --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 87dce1a52..b59550064 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -47,7 +47,7 @@ dependencies = [ "dask[complete]>=2021.7.1", "datasets", "distributed>=2021.7.1", - "fasttext==0.9.2", + "fasttext==0.9.3", "ftfy==6.1.1", "in-place==0.5.0", "jieba==0.42.1", From d358bd733b395893cc2806f4f0b4d71c8193101f Mon Sep 17 00:00:00 2001 From: Praateek Date: Tue, 17 Dec 2024 11:53:17 -0800 Subject: [PATCH 5/5] update to nightly Signed-off-by: Praateek --- pyproject.toml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b59550064..8819d0120 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,20 +75,20 @@ dynamic = ["version"] [project.optional-dependencies] # Installs CPU + GPU text curation modules cuda12x = [ - "cudf-cu12>=24.10", - "cugraph-cu12>=24.10", - "cuml-cu12>=24.10", - "dask-cuda>=24.10", - "dask-cudf-cu12>=24.10", + "cudf-cu12>=24.12", + "cugraph-cu12>=24.12", + "cuml-cu12>=24.12", +
"dask-cuda>=24.12", + "dask-cudf-cu12>=24.12", "spacy[cuda12x]>=3.6.0, <3.8.0", ] # Installs CPU + GPU text curation modules with RAPIDS Nightlies cuda12x_nightly = [ - "cudf-cu12>=24.12.0a0,<=24.12", - "cugraph-cu12>=24.12.0a0,<=24.12", - "cuml-cu12>=24.12.0a0,<=24.12", - "dask-cuda>=24.12.0a0,<=24.12", - "dask-cudf-cu12>=24.12.0a0,<=24.12", + "cudf-cu12>=25.02.0a0,<=25.02", + "cugraph-cu12>=25.02.0a0,<=25.02", + "cuml-cu12>=25.02.0a0,<=25.02", + "dask-cuda>=25.02.0a0,<=25.02", + "dask-cudf-cu12>=25.02.0a0,<=25.02", "spacy[cuda12x]>=3.6.0, <3.8.0", ] # Installs CPU + GPU text and image curation modules