GPU integration CI #100

Workflow file for this run

.github/workflows/aws-integration-gpu.yaml at 7908a8c

	# GPU integration CI.
	#
	# Runs the `integration-gpu` tox environment on an ephemeral self-hosted EC2
	# runner. Three jobs run in sequence:
	#   1. start-runner      - launches the EC2 instance and registers it as a runner
	#   2. integration-tests - provisions Python, Canonical k8s and the NVIDIA GPU
	#                          operator on that runner, then runs the tests
	#   3. stop-runner       - stops the instance, even if the earlier jobs failed
	name: GPU integration CI

	on:
	  schedule:
	    - cron: '0 0 * * 0'  # This will trigger the workflow at 00:00 every Sunday
	  workflow_dispatch:
	  workflow_call:

	jobs:
	  start-runner:
	    name: Start self-hosted EC2 runner
	    runs-on: ubuntu-22.04
	    # Exposed so later jobs can target the runner (label) and stop it
	    # (ec2-instance-id).
	    outputs:
	      label: ${{ steps.start-ec2-runner.outputs.label }}
	      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
	    steps:
	      - name: Configure AWS credentials
	        # NOTE(review): the exact version pin was lost in the source listing
	        # (it was rendered as "[email protected]"); the v4 major tag is assumed.
	        # Confirm against the repository and restore the original pin.
	        uses: aws-actions/configure-aws-credentials@v4
	        with:
	          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	          aws-region: ${{ secrets.AWS_REGION }}
	      - name: Start EC2 runner
	        id: start-ec2-runner
	        # NOTE(review): exact version pin lost in the source listing; the v2
	        # major tag is assumed. Confirm and restore the original pin.
	        uses: machulav/ec2-github-runner@v2
	        with:
	          mode: start
	          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
	          ec2-image-id: ${{ vars.AWS_EC2_IMAGE_ID }}
	          ec2-instance-type: ${{ vars.AWS_EC2_INSTANCE_TYPE }}
	          subnet-id: ${{ vars.AWS_DEFAULT_SUBNET_ID }}
	          security-group-id: ${{ vars.AWS_SECURITY_GROUP_ID }}

	  integration-tests:
	    name: Run integration tests
	    needs:
	      - start-runner
	    # Runs on the EC2 instance registered by the start-runner job.
	    runs-on: ${{ needs.start-runner.outputs.label }}
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      - name: Install python
	        run: |
	          sudo add-apt-repository ppa:deadsnakes/ppa -y
	          sudo apt update -y
	          VERSION=3.10
	          sudo apt install "python${VERSION}" "python${VERSION}-distutils" "python${VERSION}-venv" -y
	          # Bootstrap pip for this interpreter, then install tox with it.
	          wget https://bootstrap.pypa.io/get-pip.py
	          "python${VERSION}" get-pip.py
	          "python${VERSION}" -m pip install tox
	          rm get-pip.py

	      - name: Install and setup Canonical k8s
	        run: |
	          sudo snap install kubectl --classic
	          sudo snap install k8s --classic --channel=1.32-classic/stable
	          sudo k8s bootstrap
	          sudo k8s enable local-storage
	          # Write the cluster kubeconfig where kubectl and helm look by default
	          # and make it owned by the current user.
	          mkdir -p ~/.kube
	          sudo k8s config > ~/.kube/config
	          sudo chown $(id -u):$(id -g) ~/.kube/config

	      - name: Enable NVIDIA operator
	        # The validation wait loop below only exits on success; this cap keeps
	        # a failed validation from holding the EC2 runner until the job-level
	        # timeout. NOTE(review): 30 minutes is a generous guess; tune as needed.
	        timeout-minutes: 30
	        run: |
	          export KUBECONFIG=~/.kube/config
	          curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 \
	            && chmod 700 get_helm.sh \
	            && ./get_helm.sh

	          helm repo add nvidia https://helm.ngc.nvidia.com/nvidia \
	            && helm repo update

	          helm install --wait --generate-name -n gpu-operator --create-namespace nvidia/gpu-operator

	          # Wait until the GPU operator validations pass
	          while ! kubectl logs -n gpu-operator -l app=nvidia-operator-validator | grep "all validations are successful"; do
	            echo "Waiting for GPU operator validations to pass..."
	            sleep 5
	          done

	      - name: Run tests
	        run: |
	          sudo tox -e integration-gpu -- -vv -s

	  stop-runner:
	    name: Stop self-hosted EC2 runner
	    needs:
	      - start-runner
	      - integration-tests
	    runs-on: ubuntu-22.04
	    # Always run so the instance is stopped even when the tests fail or the
	    # run is cancelled.
	    if: ${{ always() }}
	    steps:
	      - name: Configure AWS credentials
	        # NOTE(review): version pin assumed (lost in the source listing); keep
	        # it identical to the one used in the start-runner job.
	        uses: aws-actions/configure-aws-credentials@v4
	        with:
	          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
	          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
	          aws-region: ${{ secrets.AWS_REGION }}
	      - name: Stop EC2 runner
	        # NOTE(review): version pin assumed (lost in the source listing); keep
	        # it identical to the one used in the start-runner job.
	        uses: machulav/ec2-github-runner@v2
	        with:
	          mode: stop
	          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
	          label: ${{ needs.start-runner.outputs.label }}
	          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

GPU integration CI #100

Workflow file

GPU integration CI #100

Uh oh!

Workflow file for this run