Merge pull request #5408 from sgibson91/enable-ebs-vol-alerting
Enable alerting for all hubs with jupyterhub-home-nfs currently enabled
sgibson91 authored Jan 24, 2025
2 parents d0dc891 + f158ee3 commit 0156665
Showing 4 changed files with 156 additions and 8 deletions.
44 changes: 44 additions & 0 deletions config/clusters/nasa-cryo/support.values.yaml
@@ -23,6 +23,25 @@ grafana:
      - grafana.cryointhecloud.2i2c.cloud

prometheus:
  alertmanager:
    enabled: true
    config:
      route:
        group_wait: 10s
        group_interval: 5m
        receiver: pagerduty
        repeat_interval: 3h
        routes:
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-cryo
              namespace: staging
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-cryo
              namespace: prod
  server:
    ingress:
      enabled: true
@@ -32,6 +51,31 @@ prometheus:
        - secretName: prometheus-tls
          hosts:
            - prometheus.cryointhecloud.2i2c.cloud
  serverFiles:
    alerting_rules.yml:
      groups:
        - name: CryoCloud staging EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-cryo
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: CryoCloud prod jupyterhub-home-nfs EBS volume full
          rules:
            - alert: prod-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-cryo
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"

aws-ce-grafana-backend:
  enabled: true
24 changes: 21 additions & 3 deletions config/clusters/nasa-veda/support.values.yaml
@@ -44,6 +44,13 @@ prometheus:
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-veda
              namespace: staging
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nasa-veda
              namespace: prod
  server:
    ingress:
      enabled: true
@@ -56,10 +63,21 @@ prometheus:
  serverFiles:
    alerting_rules.yml:
      groups:
        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-veda
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: NASA VEDA prod jupyterhub-home-nfs EBS volume full
          rules:
            - alert: prod-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
              for: 15m
              labels:
                severity: critical
60 changes: 60 additions & 0 deletions config/clusters/nmfs-openscapes/support.values.yaml
@@ -2,6 +2,30 @@ prometheusIngressAuthSecret:
  enabled: true

prometheus:
  alertmanager:
    enabled: true
    config:
      route:
        group_wait: 10s
        group_interval: 5m
        receiver: pagerduty
        repeat_interval: 3h
        routes:
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nmfs-openscapes
              namespace: staging
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nmfs-openscapes
              namespace: prod
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: nmfs-openscapes
              namespace: workshop
  server:
    # Bumped as part of https://github.com/2i2c-org/infrastructure/issues/4632
    persistentVolume:
@@ -19,6 +43,42 @@ prometheus:
        memory: 8Gi
      limits:
        memory: 8Gi
  serverFiles:
    alerting_rules.yml:
      groups:
        - name: NMFS Openscapes staging jupyterhub-home-nfs EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nmfs-openscapes
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: NMFS Openscapes prod jupyterhub-home-nfs EBS volume full
          rules:
            - alert: prod-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nmfs-openscapes
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
        - name: NMFS Openscapes workshop jupyterhub-home-nfs EBS volume full
          rules:
            - alert: workshop-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nmfs-openscapes
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"

grafana:
  grafana.ini:
36 changes: 31 additions & 5 deletions docs/howto/features/storage-quota.md
@@ -126,8 +126,12 @@ Once this is deployed, the hub will automatically enforce the storage quota for
## Enabling alerting through Prometheus Alertmanager

Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action.
To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold, and hand the resulting alert to Alertmanager.
We will then forward Alertmanager's alerts to PagerDuty.

```{note}
Use these resources to learn more about [PagerDuty's Prometheus integration](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) and [Prometheus' Alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/).
```

First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)).

@@ -137,17 +141,18 @@ prometheus:

```yaml
prometheus:
  alertmanager:
    enabled: true
```

Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. For example, to alert us when the disk usage of the NFS server has exceeded 90% of the total disk size for a sustained 15 minute period, we would add the following to the hub's support values file:

```yaml
prometheus:
  serverFiles:
    alerting_rules.yml:
      groups:
        # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
        - name: <cluster_name> <hub_name> jupyterhub-home-nfs EBS volume full
          rules:
            - alert: <hub_name>-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<hub_name>"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<hub_name>"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: <cluster_name>
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
```

```{note}
The important variables to note here are:

- `expr`: the PromQL expression Prometheus evaluates; here, the fraction of the volume's capacity that is still free
- `for`: how long `expr` must stay true before the alert fires; with the rule above, a page is only sent once free space has remained below 10% for 15 consecutive minutes
```
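
For a concrete instance of this template, the `nasa-veda` configuration added in this commit defines one such group per hub; the staging group looks like the following (the prod hub repeats it with `namespace="prod"` and a `prod-` alert name prefix):

```yaml
prometheus:
  serverFiles:
    alerting_rules.yml:
      groups:
        # Taken from config/clusters/nasa-veda/support.values.yaml in this commit
        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
          rules:
            - alert: staging-jupyterhub-home-nfs-ebs-full
              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
              for: 15m
              labels:
                severity: critical
                channel: pagerduty
                cluster: nasa-veda
              annotations:
                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
```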

@@ -170,9 +182,23 @@ prometheus:

And finally, we need to configure Alertmanager to send alerts to PagerDuty.

```yaml
prometheus:
  alertmanager:
    enabled: true
    config:
      route:
        group_wait: 10s
        group_interval: 5m
        receiver: pagerduty
        repeat_interval: 3h
        routes:
          # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
          - receiver: pagerduty
            match:
              channel: pagerduty
              cluster: <cluster_name>
              namespace: <hub_name>
```
```{note}
The important variables to understand here are:

- `group_wait`: How long Alertmanager will initially wait to send a notification to PagerDuty for a group of alerts
- `group_interval`: How long Alertmanager will wait to send a notification to PagerDuty for new alerts in a group for which an initial notification has already been sent
- `repeat_interval`: How long Alertmanager will wait to send a notification to PagerDuty again if it has already sent a successful notification
- `match`: These labels are used to group fired alerts together and are how we manage separate incidents per hub per cluster in PagerDuty

[Read more about these configuration options.](https://prometheus.io/docs/alerting/latest/configuration/#route)
```
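
The routes above all reference a `pagerduty` receiver, whose definition is not part of this diff. As a rough sketch of what such a receiver could look like, assuming a PagerDuty Events API v2 integration key is used (the key below is a placeholder; in practice the real key should live in an encrypted secrets file rather than in the plain-text values file):

```yaml
prometheus:
  alertmanager:
    config:
      receivers:
        - name: pagerduty
          pagerduty_configs:
            # Placeholder: substitute the integration key of the PagerDuty
            # service that should receive these incidents.
            - routing_key: <pagerduty-integration-key>
```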

## Increasing the size of the volume used by the NFS server