From 6b5bec794a41ab0109a5983d785a35fa263a07ba Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 27 Mar 2024 14:15:33 +0100 Subject: [PATCH 1/2] support chart, grafana: allow memory to peak to 2x requests --- helm-charts/support/values.yaml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml index 855c033d50..1b8798e3d0 100644 --- a/helm-charts/support/values.yaml +++ b/helm-charts/support/values.yaml @@ -297,8 +297,10 @@ grafana: # prometheus and grafana. # # Grafana's memory use seems to increase over time but seems reasonable to - # stay below 200Mi for years to come. Grafana's CPU use seems minuscule with - # peaks at up to 9m CPU from one user is browsing its dashboards. + # stay below 200Mi in general. Memory can peak when dashboards are updated, + # so the limit was raised to 400Mi after grafana was seen getting OOMKilled. + # Grafana's CPU use seems minuscule with peaks at up to 9m CPU when one user + # is browsing its dashboards. # # PromQL queries for CPU and memory use: # - CPU: sum(rate(container_cpu_usage_seconds_total{container="grafana", namespace="support"}[5m])) by (pod) @@ -307,7 +309,7 @@ grafana: resources: limits: cpu: 100m - memory: 200Mi + memory: 400Mi requests: cpu: 10m memory: 200Mi From 3fe8547450f9b0de585bb1b9d837d56f57ddfcb3 Mon Sep 17 00:00:00 2001 From: Erik Sundell Date: Wed, 27 Mar 2024 14:16:06 +0100 Subject: [PATCH 2/2] support chart, grafana: tweak readinessProbe for our single replica --- helm-charts/support/values.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/helm-charts/support/values.yaml b/helm-charts/support/values.yaml index 1b8798e3d0..6451176f4b 100644 --- a/helm-charts/support/values.yaml +++ b/helm-charts/support/values.yaml @@ -281,6 +281,12 @@ grafana: # type Recreate is required since we attach a PVC that can only be used by # mounted for writing by one pod at the time. 
type: Recreate + readinessProbe: + # With one grafana pod replica, having a readiness probe fail is pointless. + # The high failureThreshold ensures it won't fail before the livenessProbe + # restarts the container. + failureThreshold: 1000 + initialDelaySeconds: 1 rbac: # namespaced makes us not get ClusterRole service accounts etc, and we do