[DRAFT] Set vllm version limit to avoid upstream API incompatibility #950

	name: API CUDA Tests

	on:
	workflow_dispatch:
	push:
	branches:
	- main
	pull_request:
	branches:
	- main
	types:
	- opened
	- reopened
	- synchronize
	- labeled
	- unlabeled

	concurrency:
	cancel-in-progress: true
	group: ${{ github.workflow }}-${{ github.event.pull_request.number \|\| github.ref }}

	jobs:
	run_api_cuda_tests:
	if: ${{
	(github.event_name == 'push') \|\|
	(github.event_name == 'workflow_dispatch') \|\|
	contains( github.event.pull_request.labels.*.name, 'api') \|\|
	contains( github.event.pull_request.labels.*.name, 'cuda') \|\|
	contains( github.event.pull_request.labels.*.name, 'api_cuda')
	}}

	runs-on: [single-gpu, nvidia-gpu, a10, ci]

	container:
	image: ghcr.io/huggingface/optimum-benchmark:latest-cuda
	options: --ipc host --gpus all

	steps:
	- name: Checkout
	uses: actions/checkout@v4

	- name: Install dependencies
	run: \|
	pip install -e .[testing,timm,diffusers,codecarbon]

	- name: Run tests
	env:
	HF_TOKEN: ${{ secrets.HF_TOKEN }}
	PUSH_REPO_ID: optimum-benchmark/cuda
	run: \|
	pytest tests/test_api.py -x -s -k "api and cuda"

Provide feedback