Skip to content

Commit 2d55eaa

Browse files
authored
ci: Limit unit-test duration (#534)
Signed-off-by: oliver könig <[email protected]>
1 parent f642628 commit 2d55eaa

File tree

1 file changed

+43
-41
lines changed

1 file changed

+43
-41
lines changed

.github/workflows/gpuci.yml

Lines changed: 43 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,12 @@ on:
77
pull_request:
88
branches:
99
# We can run gpuCI on any PR targeting these branches
10-
- 'main'
11-
- '[rv][0-9].[0-9].[0-9]'
12-
- '[rv][0-9].[0-9].[0-9]rc[0-9]'
10+
- "main"
11+
- "[rv][0-9].[0-9].[0-9]"
12+
- "[rv][0-9].[0-9].[0-9]rc[0-9]"
1313
# PR has to be labeled with "gpuCI" label
1414
# If new commits are added, the "gpuCI" label has to be removed and re-added to rerun gpuCI
15-
types: [ labeled ]
15+
types: [labeled]
1616

1717
concurrency:
1818
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -40,50 +40,52 @@ jobs:
4040
# This is the tag on our Azure runner found in Actions -> Runners -> Self-hosted runners
4141
# It has 2 A100 GPUs
4242
runs-on: self-hosted-azure
43+
# Unit tests shouldn't take longer than 30 minutes
44+
timeout-minutes: 30
4345
# "run-gpu-tests" job is run if the "gpuci" label is added to the PR
4446
if: ${{ github.event.label.name == 'gpuci' || github.ref == 'refs/heads/main' }}
4547

4648
steps:
4749
# If something went wrong during the last cleanup, this step ensures any existing container is removed
48-
- name: Remove existing container if it exists
49-
run: |
50-
if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then
51-
docker rm -f nemo-curator-container
52-
fi
50+
- name: Remove existing container if it exists
51+
run: |
52+
if [ "$(docker ps -aq -f name=nemo-curator-container)" ]; then
53+
docker rm -f nemo-curator-container
54+
fi
5355
54-
# This runs the container which was pushed by build-container, which we call "nemo-curator-container"
55-
# `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
56-
# We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
57-
# `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
58-
- name: Run Docker container
59-
run: |
60-
docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity"
56+
# This runs the container which was pushed by build-container, which we call "nemo-curator-container"
57+
# `--gpus all` ensures that all of the GPUs from our self-hosted-azure runner are available in the container
58+
# We use "github.run_id" to identify the PR with the commits we want to run the PyTests with
59+
# `bash -c "sleep infinity"` keeps the container running indefinitely without exiting
60+
- name: Run Docker container
61+
run: |
62+
docker run --gpus all --name nemo-curator-container -d nemoci.azurecr.io/nemo_curator_container:${{ github.run_id }} bash -c "sleep infinity"
6163
62-
# Expect `whoami` to be "azureuser"
63-
# Expect `nvidia-smi` to show our 2 A100 GPUs
64-
- name: Check GPUs
65-
run: |
66-
whoami
67-
docker exec nemo-curator-container nvidia-smi
64+
# Expect `whoami` to be "azureuser"
65+
# Expect `nvidia-smi` to show our 2 A100 GPUs
66+
- name: Check GPUs
67+
run: |
68+
whoami
69+
docker exec nemo-curator-container nvidia-smi
6870
69-
# In the virtual environment (called "curator") we created in the container,
70-
# list all of our packages. Useful for debugging
71-
- name: Verify installations
72-
run: |
73-
docker exec nemo-curator-container pip list
71+
# In the virtual environment (called "curator") we created in the container,
72+
# list all of our packages. Useful for debugging
73+
- name: Verify installations
74+
run: |
75+
docker exec nemo-curator-container pip list
7476
75-
# In the virtual environment (called "curator") we created in the container,
76-
# run our PyTests marked with `@pytest.mark.gpu`
77-
# We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository),
78-
# and then the directory where the PyTests are located
79-
- name: Run PyTests with GPU mark
80-
run: |
81-
docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
77+
# In the virtual environment (called "curator") we created in the container,
78+
# run our PyTests marked with `@pytest.mark.gpu`
79+
# We specify the `rootdir` to help locate the "pyproject.toml" file (which is in the root directory of the repository),
80+
# and then the directory where the PyTests are located
81+
- name: Run PyTests with GPU mark
82+
run: |
83+
docker exec nemo-curator-container pytest -m gpu --rootdir /opt/NeMo-Curator /opt/NeMo-Curator/tests
8284
83-
# After running `docker stop`, the container remains in an exited state
84-
# It is still present on our system and could be restarted with `docker start`
85-
# Thus, we use `docker rm` to permanently remove it from the system
86-
- name: Cleanup
87-
if: always()
88-
run: |
89-
docker stop nemo-curator-container && docker rm nemo-curator-container
85+
# After running `docker stop`, the container remains in an exited state
86+
# It is still present on our system and could be restarted with `docker start`
87+
# Thus, we use `docker rm` to permanently remove it from the system
88+
- name: Cleanup
89+
if: always()
90+
run: |
91+
docker stop nemo-curator-container && docker rm nemo-curator-container

0 commit comments

Comments
 (0)