From 3af696323d281d84b299dc8bc7cf87067210bb4b Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 13:07:16 +0000
Subject: [PATCH 1/5] CryoCloud: Enable EBS volume monitoring and alerting

---
 config/clusters/nasa-cryo/support.values.yaml | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/config/clusters/nasa-cryo/support.values.yaml b/config/clusters/nasa-cryo/support.values.yaml
index cfb6049be..03224baa0 100644
--- a/config/clusters/nasa-cryo/support.values.yaml
+++ b/config/clusters/nasa-cryo/support.values.yaml
@@ -23,6 +23,25 @@ grafana:
       - grafana.cryointhecloud.2i2c.cloud
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-cryo
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-cryo
+              namespace: prod
   server:
     ingress:
       enabled: true
@@ -32,6 +51,31 @@ prometheus:
         - secretName: prometheus-tls
           hosts:
             - prometheus.cryointhecloud.2i2c.cloud
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: CryoCloud staging EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-cryo
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: CryoCloud prod jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-cryo
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 
 aws-ce-grafana-backend:
   enabled: true

From cf5cc0ec52f498fafd02cde42d1f2f058f86d7de Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 13:11:44 +0000
Subject: [PATCH 2/5] VEDA: Update prometheus alertmanager config so that separate alerts are triggered for each hub and they are grouped separately

---
 config/clusters/nasa-veda/support.values.yaml | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml
index 7591a19b0..c61718561 100644
--- a/config/clusters/nasa-veda/support.values.yaml
+++ b/config/clusters/nasa-veda/support.values.yaml
@@ -44,6 +44,13 @@ prometheus:
         - receiver: pagerduty
           match:
             channel: pagerduty
+            cluster: nasa-veda
+            namespace: staging
+        - receiver: pagerduty
+          match:
+            channel: pagerduty
+            cluster: nasa-veda
+            namespace: prod
   server:
     ingress:
       enabled: true
@@ -56,10 +63,21 @@ prometheus:
   serverFiles:
     alerting_rules.yml:
       groups:
-        - name: NASA VEDA jupyterhub-home-nfs EBS volume full
+        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-veda
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NASA VEDA prod jupyterhub-home-nfs EBS volume full
           rules:
-            - alert: jupyterhub-home-nfs-ebs-full
-              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
               for: 15m
               labels:
                 severity: critical

From 84729ca2cfd4543b2dd9eb03356f189afcf8b428 Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 13:48:53 +0000
Subject: [PATCH 3/5] NMFS Openscapes: Enable monitoring and alerting for EBS volume size

---
 .../nmfs-openscapes/support.values.yaml | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/config/clusters/nmfs-openscapes/support.values.yaml b/config/clusters/nmfs-openscapes/support.values.yaml
index 4b340ab42..502b32c0c 100644
--- a/config/clusters/nmfs-openscapes/support.values.yaml
+++ b/config/clusters/nmfs-openscapes/support.values.yaml
@@ -2,6 +2,30 @@ prometheusIngressAuthSecret:
   enabled: true
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: prod
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: workshop
   server:
     # Bumped as part of https://github.com/2i2c-org/infrastructure/issues/4632
     persistentVolume:
@@ -19,6 +43,42 @@ prometheus:
         memory: 8Gi
       limits:
         memory: 8Gi
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: NMFS Openscapes staging jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NMFS Openscapes prod jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NMFS Openscapes workshop jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: workshop-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1
component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1 + for: 15m + labels: + severity: critical + channel: pagerduty + cluster: nmfs-openscapes + annotations: + summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" grafana: grafana.ini: From 3e050a1d7785f1b8f98e2cb4ab45e3e43a26faec Mon Sep 17 00:00:00 2001 From: Sarah Gibson Date: Fri, 24 Jan 2025 13:49:41 +0000 Subject: [PATCH 4/5] Update config examples in docs to mimic the working alertmanager config in the hubs --- docs/howto/features/storage-quota.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md index 76407fd2b..226521ffb 100644 --- a/docs/howto/features/storage-quota.md +++ b/docs/howto/features/storage-quota.md @@ -127,7 +127,7 @@ Once this is deployed, the hub will automatically enforce the storage quota for Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action. -To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager. +To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager. First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)). @@ -137,17 +137,18 @@ prometheus: enabled: true ``` -Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. For example, to alert us when the disk usage of the NFS server exceeds 90% of the total disk size, we would add the following to the hub's support values file: +Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. 
 
 ```yaml
 prometheus:
   serverFiles:
     alerting_rules.yml:
       groups:
-        - name: jupyterhub-home-nfs EBS volume full
+        # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
+        - name: jupyterhub-home-nfs EBS volume full
           rules:
-            - alert: jupyterhub-home-nfs-ebs-full
-              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+            - alert: <namespace>-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<namespace>"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<namespace>"} < 0.1
               for: 15m
               labels:
                 severity: critical
@@ -170,9 +171,12 @@ prometheus:
         receiver: pagerduty
         repeat_interval: 3h
         routes:
+          # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
           - receiver: pagerduty
             match:
               channel: pagerduty
+              cluster: <cluster-name>
+              namespace: <namespace>
 ```
 
 ## Increasing the size of the volume used by the NFS server

From f158ee3a70e433a75e112ce50df2ac6adbc9b07a Mon Sep 17 00:00:00 2001
From: Sarah Gibson
Date: Fri, 24 Jan 2025 14:15:16 +0000
Subject: [PATCH 5/5] Add links for PagerDuty and Alertmanager and explain some variables

---
 docs/howto/features/storage-quota.md | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md
index 226521ffb..11bfc7c0b 100644
--- a/docs/howto/features/storage-quota.md
+++ b/docs/howto/features/storage-quota.md
@@ -126,8 +126,12 @@ Once this is deployed, the hub will automatically enforce the storage quota for
 ## Enabling alerting through Prometheus Alertmanager
 
 Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action.
-
 To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager.
+We will then forward Alertmanager's alert to PagerDuty.
+
+```{note}
+Use these resources to learn more about [PagerDuty's Prometheus integration](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) and [Prometheus' Alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/).
+```
 
 First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)).
 
@@ -158,6 +162,13 @@ prometheus:
                 summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 ```
 
+```{note}
+The important variables to note here are:
+
+- `expr`: This is the PromQL expression that Prometheus will evaluate
+- `for`: This is how long `expr` must remain true before the alert fires
+```
+
 And finally, we need to configure Alertmanager to send alerts to PagerDuty.
 
 ```yaml
@@ -179,6 +190,17 @@ prometheus:
               namespace: <namespace>
 ```
 
+```{note}
+The important variables to understand here are:
+
+- `group_wait`: How long Alertmanager will initially wait to send a notification to PagerDuty for a group of alerts
+- `group_interval`: How long Alertmanager will wait to send a notification to PagerDuty for new alerts in a group for which an initial notification has already been sent
+- `repeat_interval`: How long Alertmanager will wait to send a notification to PagerDuty again if it has already sent a successful notification
+- `match`: These labels are used to group fired alerts together and are how we manage separate incidents per hub per cluster in PagerDuty
+
+[Read more about these configuration options.](https://prometheus.io/docs/alerting/latest/configuration/#route)
+```
+
 ## Increasing the size of the volume used by the NFS server
 
 If the volume used by the NFS server is close to being full, we may need to increase the size of the volume. This can be done by following the instructions in the [Increase the size of an AWS EBS volume](howto:increase-size-aws-ebs) guide.