Merge pull request #1078 from moshe010/dra

add support for Dynamic Resource Allocation
k8snetworkplumbingwg · May 23, 2024 · 9f5c023 · 9f5c023
2 parents d9f1c7c + c9d411c
commit 9f5c023
Show file tree

Hide file tree

Showing 10 changed files with 355 additions and 19 deletions.
diff --git a/.github/workflows/kind-e2e.yml b/.github/workflows/kind-e2e.yml
@@ -85,6 +85,10 @@ jobs:
         working-directory: ./e2e
         run: ./test-default-route1.sh
 
+      - name: Test DRA integration
+        working-directory: ./e2e
+        run: ./test-dra-integration.sh
+
       - name: Export kind logs
         if: always()
         run: |

diff --git a/.gitignore b/.gitignore
@@ -2,6 +2,7 @@
 bin/
 e2e/bin/
 e2e/yamls/
+e2e/repos/
 
 # GOPATH created by the build script
 gopath/

diff --git a/docs/how-to-use.md b/docs/how-to-use.md
@@ -511,7 +511,7 @@ spec:
 EOF
 ```
 
-We can then create a pod which uses the `default-route` key in the JSON formatted `k8s.v1.cni.cncf.io/networks` annotation. 
+We can then create a pod which uses the `default-route` key in the JSON formatted `k8s.v1.cni.cncf.io/networks` annotation.
 
 ```
 cat <<EOF | kubectl create -f -
@@ -537,9 +537,9 @@ This will set `192.168.2.1` as the default route over the `net1` interface, such
 ```
 kubectl exec -it samplepod -- ip route
 
-default via 192.168.2.1 dev net1 
-10.244.0.0/24 dev eth0  proto kernel  scope link  src 10.244.0.169 
-10.244.0.0/16 via 10.244.0.1 dev eth0 
+default via 192.168.2.1 dev net1
+10.244.0.0/24 dev eth0  proto kernel  scope link  src 10.244.0.169
+10.244.0.0/16 via 10.244.0.1 dev eth0
 ```
 
 ## Entrypoint Parameters
@@ -634,3 +634,123 @@ Sometimes, you may wish to not have the entrypoint copy the binary file onto the
 If you wish to have auto configuration use the `readinessindicatorfile` in the configuration, you can use the `--readiness-indicator-file` to express which file should be used as the readiness indicator.
 
     --readiness-indicator-file=/path/to/file
+
+### Run pod with network annotation and Dynamic Resource Allocation driver
+
+> :warning: Dynamic Resource Allocation (DRA) is [currently an alpha](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/),
+> and is subject to change. Please consider this functionality as a preview. The architecture and usage of DRA in
+> Multus CNI may change in the future as this technology matures.
+
+Dynamic Resource Allocation is an alternative mechanism to device plugins that allows requesting pod and container
+resources.
+
+The following sections describe how to use DRA with Multus and the NVIDIA DRA driver. Other DRA networking driver
+vendors should follow similar concepts to make use of Multus DRA support.
+
+#### Prerequisite
+
+1. Kubernetes 1.27
+2. Container Runtime with CDI support enabled
+3. Kubernetes runtime-config=resource.k8s.io/v1alpha2
+4. Kubernetes feature-gates=DynamicResourceAllocation=true,KubeletPodResourcesDynamicResources=true
+
+#### Install DRA driver
+
+The current example uses the NVIDIA DRA driver for networking. This DRA driver is not publicly available. An alternative
+to this DRA driver is available at [dra-example-driver](https://github.com/kubernetes-sigs/dra-example-driver).
+
+#### Create dynamic resource class with NVIDIA network DRA driver
+
+The `ResourceClass` defines the resource pool of `sf-pool-1`.
+
+```
+# Execute following command at Kubernetes master
+cat <<EOF | kubectl create -f -
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClass
+metadata:
+  name: sf-pool-1
+driverName: net.resource.nvidia.com
+EOF
+```
+
+#### Create network attachment definition with resource name
+
+The `k8s.v1.cni.cncf.io/resourceName` should match the `ResourceClass` name defined in the section above.
+In this example it is `sf-pool-1`. Multus queries the K8s PodResource API to fetch the `resourceClass` name and also
+queries the NetworkAttachmentDefinition `k8s.v1.cni.cncf.io/resourceName`. If both have the same name, Multus sends the
+CDI device name in the DeviceID argument.
+
+##### NetworkAttachmentDefinition for ovn-kubernetes example:
+
+The following command creates a NetworkAttachmentDefinition. The CNI config is in the `config:` field.
+
+```
+# Execute following command at Kubernetes master
+cat <<EOF | kubectl create -f -
+apiVersion: "k8s.cni.cncf.io/v1"
+kind: NetworkAttachmentDefinition
+metadata:
+  name: default
+  annotations:
+    k8s.v1.cni.cncf.io/resourceName: sf-pool-1
+spec:
+  config: '{
+      "cniVersion": "0.4.0",
+      "dns": {},
+      "ipam": {},
+      "logFile": "/var/log/ovn-kubernetes/ovn-k8s-cni-overlay.log",
+      "logLevel": "4",
+      "logfile-maxage": 5,
+      "logfile-maxbackups": 5,
+      "logfile-maxsize": 100,
+      "name": "ovn-kubernetes",
+      "type": "ovn-k8s-cni-overlay"
+    }'
+EOF
+```
+
+#### Create DRA Resource Claim
+
+The following command creates the `ResourceClaim` `sf`, which requests a resource from the `ResourceClass` `sf-pool-1`.
+
+```
+# Execute following command at Kubernetes master
+cat <<EOF | kubectl create -f -
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClaim
+metadata:
+  namespace: default
+  name: sf
+spec:
+  resourceClassName: sf-pool-1
+EOF
+```
+
+#### Launch pod with DRA Resource Claim
+
+The following manifest launches a Pod with the primary network `default` and the `ResourceClaim` `sf`.
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  namespace: default
+  name: test-sf-claim
+  annotations:
+    v1.multus-cni.io/default-network: default
+spec:
+  restartPolicy: Always
+  containers:
+  - name: with-resource
+    image: docker.io/library/ubuntu:22.04
+    command: ["/bin/sh", "-ec", "while :; do echo '.'; sleep 5 ; done"]
+    resources:
+      claims:
+      - name: resource
+  resourceClaims:
+  - name: resource
+    source:
+      resourceClaimName: sf
+```
diff --git a/e2e/get_tools.sh b/e2e/get_tools.sh
@@ -13,3 +13,4 @@ curl -Lo ./bin/koko https://github.com/redhat-nfvpe/koko/releases/download/v0.83
 chmod +x ./bin/koko
 curl -Lo ./bin/jq https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64
 chmod +x ./bin/jq
+wget -qO- https://get.helm.sh/helm-v3.14.3-linux-amd64.tar.gz | tar xvzf - --strip-components=1 -C ./bin linux-amd64/helm
diff --git a/e2e/setup_cluster.sh b/e2e/setup_cluster.sh
@@ -34,7 +34,21 @@ nodes:
       nodeRegistration:
         kubeletExtraArgs:
           pod-manifest-path: "/etc/kubernetes/manifests/"
+          feature-gates: "DynamicResourceAllocation=true,KubeletPodResourcesDynamicResources=true"
   - role: worker
+# Required by DRA Integration
+##
+featureGates:
+  DynamicResourceAllocation: true
+runtimeConfig:
+  "api/alpha": "true"
+containerdConfigPatches:
+# Enable CDI as described in
+# https://github.com/container-orchestrated-devices/container-device-interface#containerd-configuration
+- |-
+  [plugins."io.containerd.grpc.v1.cri"]
+      enable_cdi = true
+##
 EOF
 
 # load multus image from container host to kind node

diff --git a/e2e/templates/dra-integration.yml.j2 b/e2e/templates/dra-integration.yml.j2
@@ -0,0 +1,49 @@
+---
+apiVersion: resource.k8s.io/v1alpha2
+kind: ResourceClaimTemplate
+metadata:
+  name: gpu.example.com
+spec:
+  spec:
+    resourceClassName: gpu.example.com
+---
+apiVersion: "k8s.cni.cncf.io/v1"
+kind: NetworkAttachmentDefinition
+metadata:
+  name: dra-net 
+  annotations:
+    k8s.v1.cni.cncf.io/resourceName: gpu.example.com
+spec:
+  config: '{
+        "cniVersion": "{{ CNI_VERSION }}",
+        "plugins": [{
+            "name": "mynet",
+            "type": "dummy",
+            "ipam": {
+                "type": "host-local",
+                "subnet": "10.1.2.0/24"
+            }
+        }]
+    }'
+---
+apiVersion: v1
+kind: Pod
+metadata:
+  name: dra-integration
+  labels:
+    app: dra-integration
+  annotations:
+    k8s.v1.cni.cncf.io/networks: default/dra-net
+spec:
+  containers:
+  - name: ctr0
+    image: ubuntu:22.04
+    command: ["bash", "-c"]
+    args: ["export; sleep 9999"]
+    resources:
+      claims:
+      - name: gpu
+  resourceClaims:
+  - name: gpu
+    source:
+      resourceClaimTemplateName: gpu.example.com
diff --git a/e2e/templates/multus-daemonset-thick.yml.j2 b/e2e/templates/multus-daemonset-thick.yml.j2
@@ -158,6 +158,9 @@ spec:
         - name: multus-daemon-config
           mountPath: /etc/cni/net.d/multus.d
           readOnly: true
+        - name: kubelet-pod-resources
+          mountPath: /var/lib/kubelet/pod-resources
+          readOnly: true
         env:
         - name: MULTUS_NODE_NAME
           valueFrom:
@@ -187,6 +190,9 @@ spec:
         - name: cnibin
           hostPath:
             path: /opt/cni/bin
+        - name: kubelet-pod-resources
+          hostPath:
+            path: /var/lib/kubelet/pod-resources
         - name: multus-daemon-config
           configMap:
             name: multus-daemon-config

diff --git a/e2e/test-dra-integration.sh b/e2e/test-dra-integration.sh
@@ -0,0 +1,59 @@
+#!/bin/sh
+set -o errexit
+
+export PATH=${PATH}:./bin
+
+# This test is using an example implementation of a DRA driver. This driver is mocking GPU resources. In our test we
+# don't care about what these resources are. We want to ensure that such a resource is correctly passed to the Pod
+# using Multus configurations. A couple of notes:
+# - We explicitly don't pin the revision of the dra-example-driver to a specific commit to ensure that the integration
+#   continues to work even when the dra-example-driver is updated (which may also indicate API changes on the DRA).
+# - The chart and the latest image are not published anywhere, therefore we have to build them locally. This leads to
+#   slower e2e suite runs.
+echo "installing dra-example-driver"
+repo_path="repos/dra-example-driver"
+
+rm -rf $repo_path || true
+git clone https://github.com/kubernetes-sigs/dra-example-driver.git ${repo_path}
+${repo_path}/demo/build-driver.sh
+KIND_CLUSTER_NAME=kind ${repo_path}/demo/scripts/load-driver-image-into-kind.sh
+chart_path=${repo_path}/deployments/helm/dra-example-driver/
+overriden_values_path=${chart_path}/overriden_values.yaml
+
+# With the thick plugin, in kind, the primary network on the control plane is not always working as expected. The pods
+# sometimes are not able to communicate with the control plane and the error looks like this:
+# failed to list *v1alpha2.PodSchedulingContext: Get "https://10.96.0.1:443/apis/resource.k8s.io/v1alpha2/podschedulingcontexts?limit=500&resourceVersion=0": dial tcp 10.96.0.1:443: connect: no route to host
+# We override the values here to schedule the controller on the worker nodes where the network is working as expected.
+cat <<EOF >> ${overriden_values_path}
+controller:
+  nodeSelector: null
+  tolerations: null
+EOF
+
+helm install \
+    -n dra-example-driver \
+    --create-namespace \
+    -f ${overriden_values_path} \
+    dra-example-driver \
+    ${chart_path}
+
+echo "installing testing pods"
+kubectl create -f yamls/dra-integration.yml
+kubectl wait --for=condition=ready -l app=dra-integration --timeout=300s pod
+
+echo "check dra-integration pod for DRA injected environment variable"
+
+# We can validate that the resource is correctly injected by checking an environment variable this dra driver is injecting
+# in the Pod.
+# https://github.com/kubernetes-sigs/dra-example-driver/blob/be2b8b1db47b8c757440e955ce5ced88c23bfe86/cmd/dra-example-kubeletplugin/cdi.go#L71C20-L71C44
+env_variable=$(kubectl exec dra-integration -- bash -c "echo \$DRA_RESOURCE_DRIVER_NAME | grep gpu.resource.example.com")
+if [ $? -eq 0 ];then
+	echo "dra-integration pod has DRA injected environment variable"
+else
+	echo "dra-integration pod doesn't have DRA injected environment variable"
+	exit 1
+fi
+
+echo "cleanup resources"
+kubectl delete -f yamls/dra-integration.yml
+helm uninstall -n dra-example-driver dra-example-driver
diff --git a/pkg/kubeletclient/kubeletclient.go b/pkg/kubeletclient/kubeletclient.go
@@ -21,6 +21,7 @@ import (
 	"net/url"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"
 
 	"golang.org/x/net/context"
@@ -137,19 +138,45 @@ func (rc *kubeletClient) GetPodResourceMap(pod *v1.Pod) (map[string]*types.Resou
 	for _, pr := range rc.resources {
 		if pr.Name == name && pr.Namespace == ns {
 			for _, cnt := range pr.Containers {
-				for _, dev := range cnt.Devices {
-					if rInfo, ok := resourceMap[dev.ResourceName]; ok {
-						rInfo.DeviceIDs = append(rInfo.DeviceIDs, dev.DeviceIds...)
-					} else {
-						resourceMap[dev.ResourceName] = &types.ResourceInfo{DeviceIDs: dev.DeviceIds}
-					}
-				}
+				rc.getDevicePluginResources(cnt.Devices, resourceMap)
+				rc.getDRAResources(cnt.DynamicResources, resourceMap)
 			}
 		}
 	}
 	return resourceMap, nil
 }
 
+func (rc *kubeletClient) getDevicePluginResources(devices []*podresourcesapi.ContainerDevices, resourceMap map[string]*types.ResourceInfo) {
+	for _, dev := range devices {
+		if rInfo, ok := resourceMap[dev.ResourceName]; ok {
+			rInfo.DeviceIDs = append(rInfo.DeviceIDs, dev.DeviceIds...)
+		} else {
+			resourceMap[dev.ResourceName] = &types.ResourceInfo{DeviceIDs: dev.DeviceIds}
+		}
+	}
+}
+
+func (rc *kubeletClient) getDRAResources(dynamicResources []*podresourcesapi.DynamicResource, resourceMap map[string]*types.ResourceInfo) {
+	for _, dynamicResource := range dynamicResources {
+		var deviceIDs []string
+		for _, claimResource := range dynamicResource.ClaimResources {
+			for _, cdiDevice := range claimResource.CDIDevices {
+				res := strings.Split(cdiDevice.Name, "=")
+				if len(res) == 2 {
+					deviceIDs = append(deviceIDs, res[1])
+				} else {
+					logging.Errorf("GetPodResourceMap: Invalid CDI format")
+				}
+			}
+		}
+		if rInfo, ok := resourceMap[dynamicResource.ClassName]; ok {
+			rInfo.DeviceIDs = append(rInfo.DeviceIDs, deviceIDs...)
+		} else {
+			resourceMap[dynamicResource.ClassName] = &types.ResourceInfo{DeviceIDs: deviceIDs}
+		}
+	}
+}
+
 func hasKubeletAPIEndpoint(url *url.URL) bool {
 	// Check for kubelet resource API socket file
 	if _, err := os.Stat(url.Path); err != nil {