From 3af696323d281d84b299dc8bc7cf87067210bb4b Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 13:07:16 +0000
Subject: [PATCH 1/5] CryoCloud: Enable EBS volume monitoring and alerting

---
 config/clusters/nasa-cryo/support.values.yaml | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/config/clusters/nasa-cryo/support.values.yaml b/config/clusters/nasa-cryo/support.values.yaml
index cfb6049be..03224baa0 100644
--- a/config/clusters/nasa-cryo/support.values.yaml
+++ b/config/clusters/nasa-cryo/support.values.yaml
@@ -23,6 +23,25 @@ grafana:
       - grafana.cryointhecloud.2i2c.cloud
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-cryo
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-cryo
+              namespace: prod
   server:
     ingress:
       enabled: true
@@ -32,6 +51,31 @@ prometheus:
         - secretName: prometheus-tls
           hosts:
             - prometheus.cryointhecloud.2i2c.cloud
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: CryoCloud staging EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-cryo
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: CryoCloud prod jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-cryo
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 
 aws-ce-grafana-backend:
   enabled: true

From cf5cc0ec52f498fafd02cde42d1f2f058f86d7de Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 13:11:44 +0000
Subject: [PATCH 2/5] VEDA: Update prometheus alertmanager config so that separate alerts are triggered for each hub and they are grouped separately

---
 config/clusters/nasa-veda/support.values.yaml | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml
index 7591a19b0..c61718561 100644
--- a/config/clusters/nasa-veda/support.values.yaml
+++ b/config/clusters/nasa-veda/support.values.yaml
@@ -44,6 +44,13 @@ prometheus:
         - receiver: pagerduty
           match:
             channel: pagerduty
+            cluster: nasa-veda
+            namespace: staging
+        - receiver: pagerduty
+          match:
+            channel: pagerduty
+            cluster: nasa-veda
+            namespace: prod
   server:
     ingress:
       enabled: true
@@ -56,10 +63,21 @@ prometheus:
   serverFiles:
     alerting_rules.yml:
       groups:
-        - name: NASA VEDA jupyterhub-home-nfs EBS volume full
+        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-veda
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NASA VEDA prod jupyterhub-home-nfs EBS volume full
           rules:
-            - alert: jupyterhub-home-nfs-ebs-full
-              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
               for: 15m
               labels:
                 severity: critical

From 84729ca2cfd4543b2dd9eb03356f189afcf8b428 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 13:48:53 +0000
Subject: [PATCH 3/5] NMFS Openscapes: Enable monitoring and alerting for EBS volume size

---
 .../nmfs-openscapes/support.values.yaml | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/config/clusters/nmfs-openscapes/support.values.yaml b/config/clusters/nmfs-openscapes/support.values.yaml
index 4b340ab42..502b32c0c 100644
--- a/config/clusters/nmfs-openscapes/support.values.yaml
+++ b/config/clusters/nmfs-openscapes/support.values.yaml
@@ -2,6 +2,30 @@ prometheusIngressAuthSecret:
   enabled: true
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: prod
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: workshop
   server:
     # Bumped as part of https://github.com/2i2c-org/infrastructure/issues/4632
     persistentVolume:
@@ -19,6 +43,42 @@ prometheus:
         memory: 8Gi
       limits:
         memory: 8Gi
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: NMFS Openscapes staging jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NMFS Openscapes prod jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NMFS Openscapes workshop jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: workshop-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1
component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1 + for: 15m + labels: + severity: critical + channel: pagerduty + cluster: nmfs-openscapes + annotations: + summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" grafana: grafana.ini: From 3e050a1d7785f1b8f98e2cb4ab45e3e43a26faec Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 24 Jan 2025 13:49:41 +0000 Subject: [PATCH 4/5] Update config examples in docs to mimic the working alertmanager config in the hubs --- docs/howto/features/storage-quota.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md index 76407fd2b..226521ffb 100644 --- a/docs/howto/features/storage-quota.md +++ b/docs/howto/features/storage-quota.md @@ -127,7 +127,7 @@ Once this is deployed, the hub will automatically enforce the storage quota for Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action. -To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager. +To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager. First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)). @@ -137,17 +137,18 @@ prometheus: enabled: true ``` -Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. For example, to alert us when the disk usage of the NFS server exceeds 90% of the total disk size, we would add the following to the hub's support values file: +Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. 
 
 ```yaml
 prometheus:
   serverFiles:
     alerting_rules.yml:
       groups:
-        - name: jupyterhub-home-nfs EBS volume full
+        # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
+        - name: jupyterhub-home-nfs EBS volume full
           rules:
-            - alert: jupyterhub-home-nfs-ebs-full
-              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+            - alert: <namespace>-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<namespace>"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<namespace>"} < 0.1
               for: 15m
               labels:
                 severity: critical
@@ -170,9 +171,12 @@ prometheus:
         receiver: pagerduty
         repeat_interval: 3h
         routes:
+          # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
           - receiver: pagerduty
             match:
               channel: pagerduty
+              cluster: <cluster-name>
+              namespace: <namespace>
 ```
 
 ## Increasing the size of the volume used by the NFS server

From f158ee3a70e433a75e112ce50df2ac6adbc9b07a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 14:15:16 +0000
Subject: [PATCH 5/5] Add links for PagerDuty and Alertmanager and explain some variables

---
 docs/howto/features/storage-quota.md | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md
index 226521ffb..11bfc7c0b 100644
--- a/docs/howto/features/storage-quota.md
+++ b/docs/howto/features/storage-quota.md
@@ -126,8 +126,12 @@ Once this is deployed, the hub will automatically enforce the storage quota for
 ## Enabling alerting through Prometheus Alertmanager
 
 Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action.
-
 To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager.
+We will then forward Alertmanager's alert to PagerDuty.
+
+```{note}
+Use these resources to learn more about [PagerDuty's Prometheus integration](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) and [Prometheus' Alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/).
+```
 
 First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)).
 
@@ -158,6 +162,13 @@ prometheus:
                 summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 ```
 
+```{note}
+The important variables to note here are:
+
+- `expr`: This is the PromQL expression that Prometheus will evaluate
+- `for`: This is how long `expr` must remain true before the alert fires
+```
+
 And finally, we need to configure Alertmanager to send alerts to PagerDuty.
 
 ```yaml
@@ -179,6 +190,17 @@ prometheus:
               namespace: <namespace>
 ```
 
+```{note}
+The important variables to understand here are:
+
+- `group_wait`: How long Alertmanager will initially wait to send a notification to PagerDuty for a group of alerts
+- `group_interval`: How long Alertmanager will wait to send a notification to PagerDuty for new alerts in a group for which an initial notification has already been sent
+- `repeat_interval`: How long Alertmanager will wait to send a notification to PagerDuty again if it has already sent a successful notification
+- `match`: These labels are used to group fired alerts together and are how we manage separate incidents per hub per cluster in PagerDuty
+
+[Read more about these configuration options.](https://prometheus.io/docs/alerting/latest/configuration/#route)
+```
+
 ## Increasing the size of the volume used by the NFS server
 
 If the volume used by the NFS server is close to being full, we may need to increase the size of the volume. This can be done by following the instructions in the [Increase the size of an AWS EBS volume](howto:increase-size-aws-ebs) guide.