# GPU integration CI #100

# Workflow that runs the GPU integration test suite.
name: GPU integration CI
on:
  # Weekly run; GitHub evaluates cron schedules in UTC.
  schedule:
    - cron: '0 0 * * 0' # 00:00 every Sunday
  # Allow manual runs from the Actions UI or the API.
  workflow_dispatch:
  # Allow other workflows to invoke this one as a reusable workflow.
  workflow_call:
jobs:
  # Starts the EC2 instance that serves as the self-hosted runner. The
  # runner label and instance id are exported as job outputs so the
  # later jobs can target the runner and shut it down.
  start-runner:
    name: Start self-hosted EC2 runner
    runs-on: ubuntu-22.04
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
    steps:
      - name: Configure AWS credentials
        # NOTE(review): the action reference was obfuscated in this copy of
        # the file; the major-version tag is used here. Confirm the exact
        # version that should be pinned.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Start EC2 runner
        id: start-ec2-runner
        # NOTE(review): the action reference was obfuscated in this copy of
        # the file; the major-version tag is used here. Confirm the exact
        # version that should be pinned.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_IMAGE_ID }}
          ec2-instance-type: ${{ vars.AWS_EC2_INSTANCE_TYPE }}
          subnet-id: ${{ vars.AWS_DEFAULT_SUBNET_ID }}
          security-group-id: ${{ vars.AWS_SECURITY_GROUP_ID }}

  # Runs on the runner started above (selected through its label):
  # installs Python 3.10 and tox, bootstraps Canonical k8s, installs the
  # NVIDIA GPU operator, then runs the `integration-gpu` tox environment.
  integration-tests:
    name: Run integration tests
    needs: [start-runner]
    runs-on: ${{ needs.start-runner.outputs.label }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: Install python
        run: |
          sudo add-apt-repository ppa:deadsnakes/ppa -y
          sudo apt update -y
          sudo apt install python3.10 python3.10-distutils python3.10-venv -y
          wget https://bootstrap.pypa.io/get-pip.py
          python3.10 get-pip.py
          python3.10 -m pip install tox
          rm get-pip.py
      - name: Install and setup Canonical k8s
        run: |
          sudo snap install kubectl --classic
          sudo snap install k8s --classic --channel=1.32-classic/stable
          sudo k8s bootstrap
          sudo k8s enable local-storage
          mkdir -p ~/.kube
          sudo k8s config > ~/.kube/config
          # The kubeconfig is written via sudo output; hand it back to the
          # current user so the kubectl/helm calls below can read it.
          sudo chown "$(id -u):$(id -g)" ~/.kube/config
      - name: Enable NVIDIA operator
        run: |
          export KUBECONFIG=~/.kube/config
          curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 \
            && chmod 700 get_helm.sh \
            && ./get_helm.sh
          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
            && helm repo update
          helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator
          # Wait until the GPU operator validations pass. The wait is
          # bounded (360 attempts x 5 s, roughly 30 minutes) so a broken
          # install fails the job instead of keeping the EC2 runner busy.
          attempts=0
          max_attempts=360
          until kubectl logs -n gpu-operator -l app=nvidia-operator-validator | grep "all validations are successful"; do
            attempts=$((attempts + 1))
            if [ "$attempts" -ge "$max_attempts" ]; then
              echo "Timed out waiting for GPU operator validations to pass" >&2
              exit 1
            fi
            echo "Waiting for GPU operator validations to pass..."
            sleep 5
          done
      - name: Run tests
        run: |
          sudo tox -e integration-gpu -- -vv -s

  # Stops the EC2 runner identified by the start-runner outputs.
  stop-runner:
    name: Stop self-hosted EC2 runner
    needs:
      - start-runner
      - integration-tests
    runs-on: ubuntu-22.04
    # Run even when an earlier job failed or was cancelled, so the EC2
    # instance is not left running.
    if: ${{ always() }}
    steps:
      - name: Configure AWS credentials
        # NOTE(review): the action reference was obfuscated in this copy of
        # the file; the major-version tag is used here. Confirm the exact
        # version that should be pinned.
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ secrets.AWS_REGION }}
      - name: Stop EC2 runner
        # NOTE(review): the action reference was obfuscated in this copy of
        # the file; the major-version tag is used here. Confirm the exact
        # version that should be pinned.
        uses: machulav/ec2-github-runner@v2
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}