   pull_request:
     branches:
       # We can run gpuCI on any PR targeting these branches
-      - 'main'
-      - '[rv][0-9].[0-9].[0-9]'
-      - '[rv][0-9].[0-9].[0-9]rc[0-9]'
+      - "main"
+      - "[rv][0-9].[0-9].[0-9]"
+      - "[rv][0-9].[0-9].[0-9]rc[0-9]"
     # The PR has to be labeled with the "gpuci" label
     # If new commits are added, the "gpuci" label has to be removed and re-added to rerun gpuCI
-    types: [ labeled ]
+    types: [labeled]
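For reference, each `[0-9]` in these filters matches a single digit, so alongside main, gpuCI can trigger on PRs targeting release branches such as r1.2.3, v0.4.0, or v2.0.0rc1 (hypothetical names that fit the globs).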
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
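The group key combines the workflow name with the PR number, falling back to the ref for non-PR events, so runs of the same PR share one concurrency slot. A minimal sketch of the full block, assuming the common pairing with cancel-in-progress (not visible in this hunk):

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  # Assumption: supersede an in-flight run for the same PR when a new one starts
  cancel-in-progress: true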
@@ -40,50 +40,52 @@ jobs:
     # This is the tag on our Azure runner, found in Actions -> Runners -> Self-hosted runners
     # It has 2 A100 GPUs
     runs-on: self-hosted-azure
+    # Unit tests shouldn't take longer than 30 minutes
+    timeout-minutes: 30
     # The "run-gpu-tests" job runs if the "gpuci" label is added to the PR (or when the workflow runs on main)
     if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }}
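Without `timeout-minutes`, GitHub Actions falls back to its default job timeout of 360 minutes, so a hung GPU test could tie up the self-hosted runner for six hours before being killed.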
 
     steps:
       # If something went wrong during the last cleanup, this step ensures any existing container is removed
       - name: Remove existing container if it exists
         run: |
           if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then
             docker rm -f nemo-curator-container
           fi
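The guard works because `docker ps -aq -f name=...` prints the IDs of matching containers, including stopped ones (`-a`), so the `if` body only fires when a leftover exists. A terser equivalent, sketched as an alternative rather than what the workflow uses:

      - name: Remove existing container if it exists
        # `docker rm -f` fails if nothing matches, so swallow that case
        run: docker rm -f nemo-curator-container 2>/dev/null || true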
 
       # This runs the image pushed by build-container as a container we call "nemo-curator-container"
       # `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
       # We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
       # `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
       - name: Run Docker container
         run: |
           docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity"
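Detaching with `-d` while PID 1 runs `sleep infinity` keeps the container alive between steps, so each later step can `docker exec` into the same environment. A hypothetical sanity-check step (not in this workflow) to confirm it is up:

      - name: Confirm container is running
        run: docker ps -f name=nemo-curator-container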
 
       # Expect `whoami` (run on the runner host) to be "azureuser"
       # Expect `nvidia-smi` (run inside the container) to show our 2 A100 GPUs
       - name: Check GPUs
         run: |
           whoami
           docker exec nemo-curator-container nvidia-smi
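To fail fast when fewer GPUs are visible than expected, a hypothetical assertion step could count them with `nvidia-smi -L`, which prints one line per GPU:

      - name: Assert both A100s are visible
        run: |
          test "$(docker exec nemo-curator-container nvidia-smi -L | wc -l)" -eq 2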
 
       # In the virtual environment (called "curator") we created in the container,
       # list all of our packages. Useful for debugging
       - name: Verify installations
         run: |
           docker exec nemo-curator-container pip list
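Note that `pip list` resolving to the "curator" virtual environment relies on the image activating that venv (e.g. via ENV in its Dockerfile), which this diff does not show. To spot-check one package, assuming the distribution is named nemo-curator:

      - name: Verify curator install
        # the package name "nemo-curator" is an assumption
        run: docker exec nemo-curator-container pip show nemo-curator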
 
       # In the virtual environment (called "curator") we created in the container,
       # run our PyTests marked with `@pytest.mark.gpu`
       # We specify the `rootdir` to help pytest locate the "pyproject.toml" file (in the root of the repository),
       # followed by the directory where the PyTests are located
       - name: Run PyTests with GPU mark
         run: |
           docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
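`-m gpu` selects only tests marked `@pytest.mark.gpu`; a CPU-only pipeline presumably runs the complement. A sketch of that complementary invocation, an assumption rather than part of this workflow:

      - name: Run PyTests without GPU mark
        run: |
          docker exec nemo-curator-container pytest -m "not gpu" --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests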
 
       # After running `docker stop`, the container remains in an exited state
       # It is still present on our system and could be restarted with `docker start`
       # Thus, we use `docker rm` to permanently remove it from the system
       - name: Cleanup
         if: always()
         run: |
           docker stop nemo-curator-container && docker rm nemo-curator-container
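`if: always()` guarantees this step runs even when the tests fail, so the runner is left clean. Because the two commands are chained with `&&`, `docker rm` only runs if the stop succeeds; `docker rm -f` collapses both into one command, sketched here as an alternative:

      - name: Cleanup
        if: always()
        run: docker rm -f nemo-curator-container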