diff --git a/config/components/rbac/role.yaml b/config/components/rbac/role.yaml index de2f34fc74..026b705eaf 100644 --- a/config/components/rbac/role.yaml +++ b/config/components/rbac/role.yaml @@ -284,7 +284,6 @@ rules: - apiGroups: - resource.k8s.io resources: - - deviceclasses - resourceclaimtemplates verbs: - get diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml index 309e93d8d0..7ea9df25a4 100644 --- a/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test1/single-clusterqueue-setup.yaml @@ -1,3 +1,4 @@ + apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test2/gpu-test2.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test2/gpu-test2.yaml new file mode 100644 index 0000000000..35f334a326 --- /dev/null +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test2/gpu-test2.yaml @@ -0,0 +1,42 @@ +# One pod, one container +# Asking for 2 distinct GPUs + +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourceClaimTemplate +metadata: + namespace: gpu-test2 + name: multiple-gpus +spec: + spec: + devices: + requests: + - name: gpus + deviceClassName: gpu.example.com + allocationMode: ExactCount + count: 2 + +--- +apiVersion: batch/v1 +kind: Job +metadata: + namespace: gpu-test2 + name: job0 + labels: + app: job + kueue.x-k8s.io/queue-name: user-queue-gpu-test2 +spec: + template: + spec: + restartPolicy: Never + containers: + - name: ctr0 + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["export; sleep 9999"] + resources: + claims: + - name: gpus + resourceClaims: + - name: gpus + resourceClaimTemplateName: multiple-gpus diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test2/single-clusterqueue-setup.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test2/single-clusterqueue-setup.yaml new file mode 100644 index 0000000000..e6f3f59ef5 --- /dev/null +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test2/single-clusterqueue-setup.yaml @@ -0,0 +1,9 @@ + +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "gpu-test2" + name: "user-queue-gpu-test2" +spec: + clusterQueue: "cluster-queue" + diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test3/gpu-test3.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test3/gpu-test3.yaml new file mode 100644 index 0000000000..4a47a71657 --- /dev/null +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test3/gpu-test3.yaml @@ -0,0 +1,56 @@ +# One pod, two containers +# Each asking for shared access to a single GPU + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: gpu-test3 + +--- +apiVersion: resource.k8s.io/v1alpha3 +kind: ResourceClaimTemplate +metadata: + namespace: gpu-test3 + name: single-gpu +spec: + spec: + devices: + requests: + - name: gpu + deviceClassName: gpu.example.com + +--- + +apiVersion: batch/v1 +kind: Job +metadata: + namespace: gpu-test3 + name: job0 + labels: + app: job + kueue.x-k8s.io/queue-name: "user-queue-gpu-test3" +spec: + parallelism: 1 + completions: 1 + template: + spec: + restartPolicy: Never + containers: + - name: ctr0 + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["export; sleep 9999"] + resources: + claims: + - name: shared-gpu + - name: ctr1 + image: ubuntu:22.04 + command: ["bash", "-c"] + args: ["export; sleep 9999"] + resources: + claims: + - name: shared-gpu + resourceClaims: + - name: shared-gpu + resourceClaimTemplateName: single-gpu diff --git a/keps/2941-DRA-Structured-Parameters/examples/gpu-test3/single-clusterqueue-setup.yaml b/keps/2941-DRA-Structured-Parameters/examples/gpu-test3/single-clusterqueue-setup.yaml new file mode 100644 index 0000000000..1c26666caf --- /dev/null +++ b/keps/2941-DRA-Structured-Parameters/examples/gpu-test3/single-clusterqueue-setup.yaml @@ -0,0 +1,9 @@ + +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + namespace: "gpu-test3" + name: "user-queue-gpu-test3" +spec: + clusterQueue: "cluster-queue" + diff --git a/pkg/controller/core/clusterqueue_controller.go b/pkg/controller/core/clusterqueue_controller.go index 73179d2d7d..785cca556f 100644 --- a/pkg/controller/core/clusterqueue_controller.go +++ b/pkg/controller/core/clusterqueue_controller.go @@ -153,7 +153,6 @@ func NewClusterQueueReconciler( // +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch // +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch // +kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaimtemplates,verbs=get;list;watch -// +kubebuilder:rbac:groups=resource.k8s.io,resources=deviceclasses,verbs=get;list;watch // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues/status,verbs=get;update;patch // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues/finalizers,verbs=update diff --git a/pkg/util/limitrange/limitrange.go b/pkg/util/limitrange/limitrange.go index 2ae360ad08..3bd3bd7b82 100644 --- a/pkg/util/limitrange/limitrange.go +++ b/pkg/util/limitrange/limitrange.go @@ -25,6 +25,7 @@ import ( "k8s.io/utils/ptr" k8sresource "k8s.io/apimachinery/pkg/api/resource" + "sigs.k8s.io/kueue/pkg/util/resource" ) @@ -150,18 +151,18 @@ func calculatePodClaims(ps *corev1.PodSpec) corev1.ResourceList { // We want to track the number of claims for the pod. for i := range ps.Containers { for _, val := range containers[i].Resources.Claims { - totalClaims[val.Name] = totalClaims[val.Name] + 1 + totalClaims[val.Name]++ } } for i := range initContainers { for _, val := range initContainers[i].Resources.Claims { - totalClaims[val.Name] = totalClaims[val.Name] + 1 + totalClaims[val.Name]++ } } for i := range initContainers { if isSidecarContainer(initContainers[i]) { for _, val := range initContainers[i].Resources.Claims { - totalClaims[val.Name] = totalClaims[val.Name] + 1 + totalClaims[val.Name]++ } } } @@ -169,13 +170,12 @@ func calculatePodClaims(ps *corev1.PodSpec) corev1.ResourceList { _, ok := totalClaims[val.Name] if ok { keyName := "" - if ptr.Deref(val.ResourceClaimName, "") != "" { + switch { + case ptr.Deref(val.ResourceClaimName, "") != "": keyName = *val.ResourceClaimName - } else if ptr.Deref(val.ResourceClaimTemplateName, "") != "" { + case ptr.Deref(val.ResourceClaimTemplateName, "") != "": keyName = *val.ResourceClaimTemplateName - } else { - // TODO: figure out what to do in this case - // DRA API says this is not allowed + default: return totalResourceClaimTemplate } countOfClaims, ok := totalResourceClaimTemplate[corev1.ResourceName(keyName)] diff --git a/pkg/workload/resources_test.go b/pkg/workload/resources_test.go index 2b60af9607..fc22ea21dd 100644 --- a/pkg/workload/resources_test.go +++ b/pkg/workload/resources_test.go @@ -21,7 +21,7 @@ import ( nodev1 "k8s.io/api/node/v1" dra "k8s.io/api/resource/v1alpha3" "k8s.io/apimachinery/pkg/api/resource" - v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" @@ -497,7 +497,7 @@ func TestAddDeviceClassesToContainerRequests(t *testing.T) { enableDRAGate: false, resourceClaimTemplate: []dra.ResourceClaimTemplate{ { - ObjectMeta: v1.ObjectMeta{ + ObjectMeta: metav1.ObjectMeta{ Name: "single-gpu", Namespace: "", }, @@ -548,7 +548,7 @@ func TestAddDeviceClassesToContainerRequests(t *testing.T) { enableDRAGate: true, resourceClaimTemplate: []dra.ResourceClaimTemplate{ { - ObjectMeta: v1.ObjectMeta{ + ObjectMeta: metav1.ObjectMeta{ Name: "single-gpu", Namespace: "", }, diff --git a/test/e2e/config/manager_e2e_patch.yaml b/test/e2e/config/manager_e2e_patch.yaml index 8de3324961..fbb0ffbe0b 100644 --- a/test/e2e/config/manager_e2e_patch.yaml +++ b/test/e2e/config/manager_e2e_patch.yaml @@ -3,4 +3,4 @@ value: IfNotPresent - op: add path: /spec/template/spec/containers/0/args/- - value: --feature-gates=VisibilityOnDemand=true,MultiKueue=true,MultiKueueBatchJobWithManagedBy=true + value: --feature-gates=VisibilityOnDemand=true,MultiKueue=true,MultiKueueBatchJobWithManagedBy=true,DynamicResourceStructuredParameters=true