Skip to content

Commit b487e9a

Browse files
andrewd-zededa
authored and OhmSpectator committed
Install Descheduler, fix startup readywait
Descheduler will be used for eve-app rebalancing during cluster node reboots/upgrades in an upcoming PR. Wait for longhorn daemonsets to be ready before an upcoming PR snapshots the single-node /var/lib kube db. Resolve occasional failure to import external-boot-image: wait for containerd before importing, with tighter error checking on import. Signed-off-by: Andrew Durbin <[email protected]>
1 parent a3fbdd9 commit b487e9a

11 files changed

+4903
-68
lines changed

.spdxignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@ pkg/rngd/cmd/rngd/vendor/
1010
pkg/wwan/mmagent/vendor/
1111
tools/get-deps/vendor/
1212
pkg/installer/vendor/
13+
pkg/kube/descheduler-job.yaml
14+
pkg/kube/descheduler_rbac.yaml
15+
pkg/kube/lh-cfg-v1.6.2.yaml

.yamllint

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,8 @@ extends: default
44
rules:
55
line-length:
66
max: 300
7-
level: warning
7+
level: warning
8+
9+
ignore:
10+
- pkg/kube/lh-cfg-v1.6.2.yaml
11+
- pkg/kube/descheduler_rbac.yaml

.yetus-excludes

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,5 @@
1717
^pkg/apparmor/etc/
1818
^eve-tools/bpftrace-compiler/examples/.+\.bt
1919
^pkg/installer/vendor/
20+
^pkg/kube/lh-cfg-v1.6.2.yaml
21+
^pkg/kube/descheduler_rbac.yaml

pkg/kube/Dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,10 +39,18 @@ COPY kubevirt-features.yaml /etc
3939
COPY external-boot-image.tar /etc/
4040

4141
# Longhorn config
42+
COPY longhorn-utils.sh /usr/bin/
43+
COPY lh-cfg-v1.6.2.yaml /etc/
4244
COPY iscsid.conf /etc/iscsi/
4345
COPY longhorn-generate-support-bundle.sh /usr/bin/
4446
COPY nsmounter /usr/bin/
4547

48+
# descheduler
49+
COPY descheduler-utils.sh /usr/bin/
50+
COPY descheduler_rbac.yaml /etc/
51+
COPY descheduler-job.yaml /etc/
52+
COPY descheduler-policy-configmap.yaml /etc/
53+
4654
# Containerd config
4755
RUN mkdir -p /etc/containerd
4856
COPY config-k3s.toml /etc/containerd/

pkg/kube/cluster-init.sh

Lines changed: 73 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
K3S_VERSION=v1.28.5+k3s1
77
KUBEVIRT_VERSION=v1.1.0
8-
LONGHORN_VERSION=v1.6.2
98
CDI_VERSION=v1.54.0
109
NODE_IP=""
1110
MAX_K3S_RESTARTS=10
@@ -18,6 +17,11 @@ HOSTNAME=""
1817
VMICONFIG_FILENAME="/run/zedkube/vmiVNC.run"
1918
VNC_RUNNING=false
2019

20+
# shellcheck source=pkg/kube/descheduler-utils.sh
21+
. /usr/bin/descheduler-utils.sh
22+
# shellcheck source=pkg/kube/longhorn-utils.sh
23+
. /usr/bin/longhorn-utils.sh
24+
2125
logmsg() {
2226
local MSG
2327
local TIME
@@ -220,40 +224,6 @@ config_cluster_roles() {
220224
touch /var/lib/debuguser-initialized
221225
}
222226

223-
apply_longhorn_disk_config() {
224-
node=$1
225-
kubectl label node "$node" node.longhorn.io/create-default-disk='config'
226-
kubectl annotate node "$node" node.longhorn.io/default-disks-config='[ { "path":"/persist/vault/volumes", "allowScheduling":true }]'
227-
}
228-
229-
check_overwrite_nsmounter() {
230-
### REMOVE ME+
231-
# When https://github.com/longhorn/longhorn/issues/6857 is resolved, remove this 'REMOVE ME' section
232-
# In addition to pkg/kube/nsmounter and the copy of it in pkg/kube/Dockerfile
233-
longhornCsiPluginPods=$(kubectl -n longhorn-system get pod -o json | jq -r '.items[] | select(.metadata.labels.app=="longhorn-csi-plugin" and .status.phase=="Running") | .metadata.name')
234-
for csiPod in $longhornCsiPluginPods; do
235-
if ! kubectl -n longhorn-system exec "pod/${csiPod}" --container=longhorn-csi-plugin -- ls /usr/local/sbin/nsmounter.updated > /dev/null 2>@1; then
236-
if kubectl -n longhorn-system exec -i "pod/${csiPod}" --container=longhorn-csi-plugin -- tee /usr/local/sbin/nsmounter < /usr/bin/nsmounter; then
237-
logmsg "Updated nsmounter in longhorn pod ${csiPod}"
238-
kubectl -n longhorn-system exec "pod/${csiPod}" --container=longhorn-csi-plugin -- touch /usr/local/sbin/nsmounter.updated
239-
fi
240-
fi
241-
done
242-
### REMOVE ME-
243-
}
244-
245-
# A spot to do persistent configuration of longhorn
246-
# These are applied once per cluster
247-
longhorn_post_install_config() {
248-
# Wait for longhorn objects to be available before patching them
249-
lhSettingsAvailable=$(kubectl -n longhorn-system get settings -o json | jq '.items | length>0')
250-
if [ "$lhSettingsAvailable" != "true" ]; then
251-
return
252-
fi
253-
kubectl -n longhorn-system patch settings.longhorn.io/upgrade-checker -p '[{"op":"replace","path":"/value","value":"false"}]' --type json
254-
touch /var/lib/longhorn_configured
255-
}
256-
257227
check_start_k3s() {
258228
pgrep -f "k3s server" > /dev/null 2>&1
259229
if [ $? -eq 1 ]; then
@@ -283,6 +253,48 @@ check_start_k3s() {
283253
return 0
284254
}
285255

256+
external_boot_image_import() {
	# NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/
	# Install the external-boot-image into our eve user containerd registry.
	# This image contains just a kernel and initrd used to bootstrap a container
	# image as a VM, very similar to what kvm-based eve does to start a container as a VM.
	# Returns 0 when the image is already present or was imported and tagged
	# successfully; returns 1 so the caller can retry on the next loop iteration.

	boot_img_path="/etc/external-boot-image.tar"

	# The tarball ships in this image; bail out early if it is missing so the
	# tar/jq pipeline below does not fail with a confusing error.
	if [ ! -f "$boot_img_path" ]; then
		logmsg "$boot_img_path not found, cannot import external-boot-image"
		return 1
	fi

	# Is containerd up?
	if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock info > /dev/null 2>&1; then
		logmsg "k3s-containerd not yet running for image import"
		return 1
	fi

	eve_external_boot_img_name="docker.io/lfedge/eve-external-boot-image"
	eve_external_boot_img_tag=$(cat /run/eve-release)
	eve_external_boot_img="${eve_external_boot_img_name}:${eve_external_boot_img_tag}"
	# Quiet check: inspecti prints the full image JSON on success, which would
	# otherwise be emitted on every pass through the caller's loop.
	if /var/lib/k3s/bin/k3s crictl --runtime-endpoint=unix:///run/containerd-user/containerd.sock inspecti "$eve_external_boot_img" > /dev/null 2>&1; then
		# Already imported
		return 0
	fi

	# Sanity-check the tarball: the first repo tag in its manifest must carry
	# the expected image name, otherwise the tar is corrupt or not ours.
	import_name_tag=$(tar -xOf "$boot_img_path" manifest.json | jq -r '.[0].RepoTags[0]')
	import_name=$(echo "$import_name_tag" | cut -d ':' -f 1)
	if [ "$import_name" != "$eve_external_boot_img_name" ]; then
		logmsg "external-boot-image.tar is corrupt"
		return 1
	fi

	if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image import "$boot_img_path"; then
		logmsg "import $boot_img_path failed"
		return 1
	fi

	# Re-tag so the image is addressable under the running eve release tag.
	if ! /var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock image tag "$import_name_tag" "$eve_external_boot_img"; then
		logmsg "re-tag external-boot-image failed"
		return 1
	fi
	logmsg "Successfully installed external-boot-image $import_name_tag as $eve_external_boot_img"
	return 0
}
297+
286298
check_start_containerd() {
287299
# Needed to get the pods to start
288300
if [ ! -L /usr/bin/runc ]; then
@@ -299,23 +311,6 @@ check_start_containerd() {
299311
containerd_pid=$!
300312
logmsg "Started k3s-containerd at pid:$containerd_pid"
301313
fi
302-
if [ -f /etc/external-boot-image.tar ]; then
303-
# NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/
304-
# Install external-boot-image image to our eve user containerd registry.
305-
# This image contains just kernel and initrd to bootstrap a container image as a VM.
306-
# This is very similar to what we do on kvm based eve to start container as a VM.
307-
logmsg "Trying to install new external-boot-image"
308-
# This import happens once per reboot
309-
if ctr -a /run/containerd-user/containerd.sock image import /etc/external-boot-image.tar; then
310-
eve_external_boot_img_tag=$(cat /run/eve-release)
311-
eve_external_boot_img=docker.io/lfedge/eve-external-boot-image:"$eve_external_boot_img_tag"
312-
import_tag=$(tar -xOf /etc/external-boot-image.tar manifest.json | jq -r '.[0].RepoTags[0]')
313-
ctr -a /run/containerd-user/containerd.sock image tag "$import_tag" "$eve_external_boot_img"
314-
315-
logmsg "Successfully installed external-boot-image $import_tag as $eve_external_boot_img"
316-
rm -f /etc/external-boot-image.tar
317-
fi
318-
fi
319314
}
320315
trigger_k3s_selfextraction() {
321316
# Analysis of the k3s source shows nearly any cli command will first self-extract a series of binaries.
@@ -440,6 +435,9 @@ if [ ! -f /var/lib/all_components_initialized ]; then
440435
sleep 1
441436

442437
check_start_containerd
438+
if ! external_boot_image_import; then
439+
continue
440+
fi
443441
if ! check_start_k3s; then
444442
continue
445443
fi
@@ -497,22 +495,30 @@ if [ ! -f /var/lib/all_components_initialized ]; then
497495
continue
498496
fi
499497

500-
if [ ! -f /var/lib/longhorn_initialized ]; then
501-
wait_for_item "longhorn"
502-
logmsg "Installing longhorn version ${LONGHORN_VERSION}"
503-
apply_longhorn_disk_config "$HOSTNAME"
504-
lhCfgPath=/var/lib/lh-cfg-${LONGHORN_VERSION}.yaml
505-
if [ ! -e $lhCfgPath ]; then
506-
curl -k https://raw.githubusercontent.com/longhorn/longhorn/${LONGHORN_VERSION}/deploy/longhorn.yaml > "$lhCfgPath"
507-
fi
508-
if ! grep -q 'create-default-disk-labeled-nodes: true' "$lhCfgPath"; then
509-
sed -i '/ default-setting.yaml: |-/a\ create-default-disk-labeled-nodes: true' "$lhCfgPath"
510-
fi
511-
kubectl apply -f "$lhCfgPath"
512-
touch /var/lib/longhorn_initialized
498+
#
499+
# Longhorn
500+
#
501+
wait_for_item "longhorn"
502+
if ! longhorn_install "$HOSTNAME"; then
503+
continue
504+
fi
505+
if ! longhorn_is_ready; then
506+
# It can take a moment for the new pods to get to ContainerCreating
507+
# Just back off until they are caught by the earlier are_all_pods_ready
508+
sleep 30
509+
continue
510+
fi
511+
logmsg "longhorn ready"
512+
513+
#
514+
# Descheduler
515+
#
516+
wait_for_item "descheduler"
517+
if ! descheduler_install; then
518+
continue
513519
fi
514520

515-
if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then
521+
if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ]; then
516522
logmsg "All components initialized"
517523
touch /var/lib/all_components_initialized
518524
fi
@@ -541,7 +547,7 @@ else
541547
cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml
542548
fi
543549
else
544-
if [ -e /var/lib/longhorn_initialized ]; then
550+
if longhorn_is_ready; then
545551
check_overwrite_nsmounter
546552
fi
547553
if [ ! -e /var/lib/longhorn_configured ]; then

pkg/kube/descheduler-job.yaml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
---
2+
# from: https://raw.githubusercontent.com/kubernetes-sigs/descheduler/v0.29.0/kubernetes/job/job.yaml
3+
apiVersion: batch/v1
4+
kind: Job
5+
metadata:
6+
name: descheduler-job
7+
namespace: kube-system
8+
spec:
9+
parallelism: 1
10+
completions: 1
11+
template:
12+
metadata:
13+
name: descheduler-pod
14+
spec:
15+
priorityClassName: system-cluster-critical
16+
containers:
17+
- name: descheduler
18+
image: registry.k8s.io/descheduler/descheduler:v0.29.0
19+
volumeMounts:
20+
- mountPath: /policy-dir
21+
name: policy-volume
22+
command:
23+
- "/bin/descheduler"
24+
args:
25+
- "--policy-config-file"
26+
- "/policy-dir/policy.yaml"
27+
- "--v"
28+
- "3"
29+
resources:
30+
requests:
31+
cpu: "500m"
32+
memory: "256Mi"
33+
livenessProbe:
34+
failureThreshold: 3
35+
httpGet:
36+
path: /healthz
37+
port: 10258
38+
scheme: HTTPS
39+
initialDelaySeconds: 3
40+
periodSeconds: 10
41+
securityContext:
42+
allowPrivilegeEscalation: false
43+
capabilities:
44+
drop:
45+
- ALL
46+
privileged: false
47+
readOnlyRootFilesystem: true
48+
runAsNonRoot: true
49+
restartPolicy: "Never"
50+
serviceAccountName: descheduler-sa
51+
volumes:
52+
- name: policy-volume
53+
configMap:
54+
name: descheduler-policy-configmap
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Copyright (c) 2024 Zededa, Inc.
2+
# SPDX-License-Identifier: Apache-2.0
3+
---
4+
# Override default policy to rebalance eve apps when descheduler is run.
5+
apiVersion: v1
6+
kind: ConfigMap
7+
metadata:
8+
name: descheduler-policy-configmap
9+
namespace: kube-system
10+
data:
11+
policy.yaml: |
12+
apiVersion: "descheduler/v1alpha2"
13+
kind: "DeschedulerPolicy"
14+
profiles:
15+
- name: EveAppNodeAffinity
16+
pluginConfig:
17+
- name: "RemovePodsViolatingNodeAffinity"
18+
args:
19+
namespaces:
20+
include:
21+
- "eve-kube-app"
22+
nodeAffinityType:
23+
- "preferredDuringSchedulingIgnoredDuringExecution"
24+
plugins:
25+
deschedule:
26+
enabled:
27+
- "RemovePodsViolatingNodeAffinity"

pkg/kube/descheduler-utils.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#!/bin/sh
2+
#
3+
# Copyright (c) 2024 Zededa, Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
6+
DESCHEDULER_VERSION="v0.29.0"

# Apply the descheduler RBAC objects and the eve-app rebalancing policy
# configmap. Returns non-zero if either apply fails so the caller can retry
# on a later pass.
descheduler_install()
{
    logmsg "Applying Descheduler ${DESCHEDULER_VERSION}"
    kubectl apply -f /etc/descheduler_rbac.yaml || {
        logmsg "descheduler rbac not yet applied"
        return 1
    }
    kubectl apply -f /etc/descheduler-policy-configmap.yaml || {
        logmsg "descheduler configmap not yet applied"
        return 1
    }
    return 0
}

pkg/kube/descheduler_rbac.yaml

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
---
2+
# from: https://raw.githubusercontent.com/kubernetes-sigs/descheduler/refs/tags/v0.29.0/kubernetes/base/rbac.yaml
3+
kind: ClusterRole
4+
apiVersion: rbac.authorization.k8s.io/v1
5+
metadata:
6+
name: descheduler-cluster-role
7+
rules:
8+
- apiGroups: ["events.k8s.io"]
9+
resources: ["events"]
10+
verbs: ["create", "update"]
11+
- apiGroups: [""]
12+
resources: ["nodes"]
13+
verbs: ["get", "watch", "list"]
14+
- apiGroups: [""]
15+
resources: ["namespaces"]
16+
verbs: ["get", "watch", "list"]
17+
- apiGroups: [""]
18+
resources: ["pods"]
19+
verbs: ["get", "watch", "list", "delete"]
20+
- apiGroups: [""]
21+
resources: ["pods/eviction"]
22+
verbs: ["create"]
23+
- apiGroups: ["scheduling.k8s.io"]
24+
resources: ["priorityclasses"]
25+
verbs: ["get", "watch", "list"]
26+
- apiGroups: ["coordination.k8s.io"]
27+
resources: ["leases"]
28+
verbs: ["create"]
29+
- apiGroups: ["coordination.k8s.io"]
30+
resources: ["leases"]
31+
resourceNames: ["descheduler"]
32+
verbs: ["get", "patch", "delete"]
33+
---
34+
apiVersion: v1
35+
kind: ServiceAccount
36+
metadata:
37+
name: descheduler-sa
38+
namespace: kube-system
39+
---
40+
apiVersion: rbac.authorization.k8s.io/v1
41+
kind: ClusterRoleBinding
42+
metadata:
43+
name: descheduler-cluster-role-binding
44+
roleRef:
45+
apiGroup: rbac.authorization.k8s.io
46+
kind: ClusterRole
47+
name: descheduler-cluster-role
48+
subjects:
49+
- name: descheduler-sa
50+
kind: ServiceAccount
51+
namespace: kube-system

0 commit comments

Comments
 (0)