# GPU integration CI #100
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the GPU integration tests on a self-hosted EC2 runner that is started
# for the run and stopped again afterwards. Triggered weekly by cron, manually
# (workflow_dispatch), or from another workflow (workflow_call).
name: GPU integration CI

on:
  schedule:
    - cron: '0 0 * * 0'  # This will trigger the workflow at 00:00 every Sunday
  workflow_dispatch:
  workflow_call:

jobs:
  # Starts the EC2 instance and exposes its runner label and instance id so
  # the following jobs can target it and shut it down.
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-22.04
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        # NOTE(review): this reference was mangled by e-mail obfuscation when
        # the file was captured ("[email protected]"). It presumably read
        # aws-actions/configure-aws-credentials@<version>; restore the exact
        # pinned version from the repository before running the workflow.
        uses: aws-actions/[email protected]
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): mangled by e-mail obfuscation as above; presumably
        # machulav/ec2-github-runner@<version> - restore the pinned version.
        uses: machulav/[email protected]
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_IMAGE_ID }}
          ec2-instance-type: ${{ vars.AWS_EC2_INSTANCE_TYPE }}
          subnet-id: ${{ vars.AWS_DEFAULT_SUBNET_ID }}
          security-group-id: ${{ vars.AWS_SECURITY_GROUP_ID }}

  # Runs on the runner started above: installs Python and tox, bootstraps a
  # Canonical k8s cluster, installs the NVIDIA GPU operator, then runs the
  # integration-gpu tox environment.
  integration-tests:
    name: Run integration tests
    needs: [start-runner]
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install python
        run: |
          sudo add-apt-repository ppa:deadsnakes/ppa -y
          sudo apt update -y
          # Single source of truth for the interpreter version used below.
          VERSION=3.10
          sudo apt install "python${VERSION}" "python${VERSION}-distutils" "python${VERSION}-venv" -y
          wget https://bootstrap.pypa.io/get-pip.py
          "python${VERSION}" get-pip.py
          "python${VERSION}" -m pip install tox
          rm get-pip.py
      - name: Install and setup Canonical k8s
        run: |
          sudo snap install kubectl --classic
          sudo snap install k8s --classic --channel=1.32-classic/stable
          sudo k8s bootstrap
          sudo k8s enable local-storage
          # Export the cluster config for the current user so kubectl and
          # helm can reach the cluster without sudo.
          mkdir -p ~/.kube
          sudo k8s config > ~/.kube/config
          sudo chown $(id -u):$(id -g) ~/.kube/config
      - name: Enable NVIDIA operator
        # The validation poll at the end of this step has no exit condition of
        # its own; without a limit a failed GPU operator rollout would keep
        # the step (and the EC2 instance) running. Tune the value as needed.
        timeout-minutes: 30
        run: |
          export KUBECONFIG=~/.kube/config
          curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 \
            && chmod 700 get_helm.sh \
            && ./get_helm.sh
          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
            && helm repo update
          helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator
          # Wait until the GPU operator validations pass
          while ! kubectl logs -n gpu-operator -l app=nvidia-operator-validator | grep "all validations are successful"; do
            echo "Waiting for GPU operator validations to pass..."
            sleep 5
          done
      - name: Run tests
        run: |
          sudo tox -e integration-gpu -- -vv -s

  # Always runs, even when the jobs above fail or are cancelled, so the EC2
  # instance is not left running.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - integration-tests
    runs-on: ubuntu-22.04
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        # NOTE(review): mangled by e-mail obfuscation ("[email protected]");
        # presumably aws-actions/configure-aws-credentials@<version> - restore
        # the exact pinned version from the repository.
        uses: aws-actions/[email protected]
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        # NOTE(review): mangled by e-mail obfuscation; presumably
        # machulav/ec2-github-runner@<version> - restore the pinned version.
        uses: machulav/[email protected]
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}