Merge pull request #5408 from sgibson91/enable-ebs-vol-alerting
Enable alerting for all hubs with jupyterhub-home-nfs currently enabled
sgibson91 authored Jan 24, 2025
2 parents d0dc891 + f158ee3 commit 0156665
Showing 4 changed files with 156 additions and 8 deletions.
44 changes: 44 additions & 0 deletions config/clusters/nasa-cryo/support.values.yaml
@@ -23,6 +23,25 @@ grafana:
      - grafana.cryointhecloud.2i2c.cloud

prometheus:
  alertmanager:
    enabled: true
    config:
      route:
        group_wait: 10s
        group_interval: 5m
        receiver: pagerduty
        repeat_interval: 3h
        routes:
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-cryo
              namespace: staging
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-cryo
              namespace: prod
  server:
    ingress:
      enabled: true
@@ -32,6 +51,31 @@ prometheus:
        - secretName: prometheus-tls
          hosts:
            - prometheus.cryointhecloud.2i2c.cloud
  serverFiles:
    alerting_rules.yml:
      groups:
        - name: CryoCloud staging EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-cryo
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: CryoCloud prod jupyterhub-home-nfs EBS volume full
          rules:
            - alert: prod-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-cryo
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"

aws-ce-grafana-backend:
  enabled: true
24 changes: 21 additions & 3 deletions config/clusters/nasa-veda/support.values.yaml
@@ -44,6 +44,13 @@ prometheus:
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-veda
              namespace: staging
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-veda
              namespace: prod
  server:
    ingress:
      enabled: true
@@ -56,10 +63,21 @@ prometheus:
  serverFiles:
    alerting_rules.yml:
      groups:
        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-veda
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: NASA VEDA prod jupyterhub-home-nfs EBS volume full
          rules:
            - alert: prod-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
              for: 15m
              labels:
                severity: critical
60 changes: 60 additions & 0 deletions config/clusters/nmfs-openscapes/support.values.yaml
@@ -2,6 +2,30 @@ prometheusIngressAuthSecret:
  enabled: true

prometheus:
  alertmanager:
    enabled: true
    config:
      route:
        group_wait: 10s
        group_interval: 5m
        receiver: pagerduty
        repeat_interval: 3h
        routes:
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nmfs-openscapes
              namespace: staging
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nmfs-openscapes
              namespace: prod
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nmfs-openscapes
              namespace: workshop
  server:
    # Bumped as part of https://github.com/2i2c-org/infrastructure/issues/4632
    persistentVolume:
@@ -19,6 +43,42 @@ prometheus:
        memory: 8Gi
      limits:
        memory: 8Gi
  serverFiles:
    alerting_rules.yml:
      groups:
        - name: NMFS Openscapes staging jupyterhub-home-nfs EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nmfs-openscapes
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: NMFS Openscapes prod jupyterhub-home-nfs EBS volume full
          rules:
            - alert: prod-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nmfs-openscapes
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: NMFS Openscapes workshop jupyterhub-home-nfs EBS volume full
          rules:
            - alert: workshop-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nmfs-openscapes
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"

grafana:
  grafana.ini:
36 changes: 31 additions & 5 deletions docs/howto/features/storage-quota.md
@@ -126,8 +126,12 @@ Once this is deployed, the hub will automatically enforce the storage quota for
## Enabling alerting through Prometheus Alertmanager

Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action.
To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold, and hand the resulting alert to Alertmanager.
We will then forward Alertmanager's alerts to PagerDuty.

```{note}
Use these resources to learn more about [PagerDuty's Prometheus integration](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) and [Prometheus' Alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/).
```

First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)).

@@ -137,17 +141,18 @@ prometheus:

```yaml
prometheus:
  alertmanager:
    enabled: true
```

Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. For example, to alert us when the disk usage of the NFS server has exceeded 90% of the total disk size for a sustained 15 minute period, we would add the following to the hub's support values file:

```yaml
prometheus:
  serverFiles:
    alerting_rules.yml:
      groups:
        # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
        - name: <cluster_name> <hub_name> jupyterhub-home-nfs EBS volume full
          rules:
            - alert: <hub_name>-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<hub_name>"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<hub_name>"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: <cluster_name>
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
```

```{note}
The important variables to note here are:

- `expr`: the PromQL expression Prometheus evaluates; here, the fraction of the volume's capacity that is still free
- `for`: how long `expr` must stay true before the alert fires; with the rule above, a page is only sent once free space has remained below 10% for 15 consecutive minutes
```
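
For a concrete instance of this template, the `nasa-veda` configuration added in this commit defines one such group per hub; the staging group looks like the following (the prod hub repeats it with `namespace="prod"` and a `prod-` alert name prefix):

```yaml
prometheus:
  serverFiles:
    alerting_rules.yml:
      groups:
        # Taken from config/clusters/nasa-veda/support.values.yaml in this commit
        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-veda
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
```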

@@ -170,9 +182,23 @@ prometheus:

And finally, we need to configure Alertmanager to send alerts to PagerDuty.

```yaml
prometheus:
  alertmanager:
    enabled: true
    config:
      route:
        group_wait: 10s
        group_interval: 5m
        receiver: pagerduty
        repeat_interval: 3h
        routes:
          # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: <cluster_name>
              namespace: <hub_name>
```
```{note}
The important variables to understand here are:

- `group_wait`: How long Alertmanager will initially wait to send a notification to PagerDuty for a group of alerts
- `group_interval`: How long Alertmanager will wait to send a notification to PagerDuty for new alerts in a group for which an initial notification has already been sent
- `repeat_interval`: How long Alertmanager will wait to send a notification to PagerDuty again if it has already sent a successful notification
- `match`: These labels are used to group fired alerts together and are how we manage separate incidents per hub per cluster in PagerDuty

[Read more about these configuration options.](https://prometheus.io/docs/alerting/latest/configuration/#route)
```
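
The routes above all reference a `pagerduty` receiver, whose definition is not part of this diff. As a rough sketch of what such a receiver could look like, assuming a PagerDuty Events API v2 integration key is used (the key below is a placeholder; in practice the real key should live in an encrypted secrets file rather than in the plain-text values file):

```yaml
prometheus:
  alertmanager:
    config:
      receivers:
        - name: pagerduty
          pagerduty_configs:
            # Placeholder: substitute the integration key of the PagerDuty
            # service that should receive these incidents.
            - routing_key: <pagerduty-integration-key>
```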

## Increasing the size of the volume used by the NFS server