diff --git a/config/clusters/nasa-cryo/support.values.yaml b/config/clusters/nasa-cryo/support.values.yaml
index cfb6049be..03224baa0 100644
--- a/config/clusters/nasa-cryo/support.values.yaml
+++ b/config/clusters/nasa-cryo/support.values.yaml
@@ -23,6 +23,25 @@ grafana:
           - grafana.cryointhecloud.2i2c.cloud
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-cryo
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-cryo
+              namespace: prod
   server:
     ingress:
       enabled: true
@@ -32,6 +51,31 @@ prometheus:
         - secretName: prometheus-tls
           hosts:
             - prometheus.cryointhecloud.2i2c.cloud
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: CryoCloud staging EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-cryo
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: CryoCloud prod jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-cryo
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 
 aws-ce-grafana-backend:
   enabled: true
diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml
index 7591a19b0..c61718561 100644
--- a/config/clusters/nasa-veda/support.values.yaml
+++ b/config/clusters/nasa-veda/support.values.yaml
@@ -44,6 +44,13 @@ prometheus:
           - receiver: pagerduty
             match:
               channel: pagerduty
+              cluster: nasa-veda
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nasa-veda
+              namespace: prod
   server:
     ingress:
       enabled: true
@@ -56,10 +63,21 @@ prometheus:
   serverFiles:
     alerting_rules.yml:
       groups:
-        - name: NASA VEDA jupyterhub-home-nfs EBS volume full
+        - name: NASA VEDA staging jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nasa-veda
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NASA VEDA prod jupyterhub-home-nfs EBS volume full
           rules:
-            - alert: jupyterhub-home-nfs-ebs-full
-              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
               for: 15m
               labels:
                 severity: critical
diff --git a/config/clusters/nmfs-openscapes/support.values.yaml b/config/clusters/nmfs-openscapes/support.values.yaml
index 4b340ab42..502b32c0c 100644
--- a/config/clusters/nmfs-openscapes/support.values.yaml
+++ b/config/clusters/nmfs-openscapes/support.values.yaml
@@ -2,6 +2,30 @@ prometheusIngressAuthSecret:
   enabled: true
 
 prometheus:
+  alertmanager:
+    enabled: true
+    config:
+      route:
+        group_wait: 10s
+        group_interval: 5m
+        receiver: pagerduty
+        repeat_interval: 3h
+        routes:
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: staging
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: prod
+          - receiver: pagerduty
+            match:
+              channel: pagerduty
+              cluster: nmfs-openscapes
+              namespace: workshop
   server:
     # Bumped as part of https://github.com/2i2c-org/infrastructure/issues/4632
     persistentVolume:
@@ -19,6 +43,42 @@ prometheus:
         memory: 8Gi
       limits:
         memory: 8Gi
+  serverFiles:
+    alerting_rules.yml:
+      groups:
+        - name: NMFS Openscapes staging jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: staging-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="staging"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NMFS Openscapes prod jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: prod-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="prod"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
+        - name: NMFS Openscapes workshop jupyterhub-home-nfs EBS volume full
+          rules:
+            - alert: workshop-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="workshop"} < 0.1
+              for: 15m
+              labels:
+                severity: critical
+                channel: pagerduty
+                cluster: nmfs-openscapes
+              annotations:
+                summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 
 grafana:
   grafana.ini:
diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md
index 76407fd2b..11bfc7c0b 100644
--- a/docs/howto/features/storage-quota.md
+++ b/docs/howto/features/storage-quota.md
@@ -126,8 +126,12 @@ Once this is deployed, the hub will automatically enforce the storage quota for
 ## Enabling alerting through Prometheus Alertmanager
 
 Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action.
+To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager.
+We will then forward Alertmanager's alert to PagerDuty.
 
-To do this, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold using Alertmanager.
+```{note}
+Use these resources to learn more about [PagerDuty's Prometheus integration](https://www.pagerduty.com/docs/guides/prometheus-integration-guide/) and [Prometheus' Alertmanager configuration](https://prometheus.io/docs/alerting/latest/configuration/).
+```
 
 First, we need to enable Alertmanager in the hub's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)).
 
@@ -137,17 +141,18 @@ prometheus:
     enabled: true
 ```
 
-Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. For example, to alert us when the disk usage of the NFS server exceeds 90% of the total disk size, we would add the following to the hub's support values file:
+Then, we need to create a Prometheus rule that will alert us when the disk usage of the NFS server exceeds a certain threshold. For example, to alert us when the disk usage of the NFS server exceeds 90% of the total disk size over a 15-minute period, we would add the following to the hub's support values file:
 
 ```yaml
 prometheus:
   serverFiles:
     alerting_rules.yml:
       groups:
-        - name: jupyterhub-home-nfs EBS volume full
+        # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
+        - name: jupyterhub-home-nfs EBS volume full
           rules:
-            - alert: jupyterhub-home-nfs-ebs-full
-              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1
+            - alert: <namespace>-jupyterhub-home-nfs-ebs-full
+              expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<namespace>"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics", namespace="<namespace>"} < 0.1
               for: 15m
               labels:
                 severity: critical
@@ -157,6 +162,13 @@ prometheus:
                 summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}"
 ```
 
+```{note}
+The important variables to note here are:
+
+- `expr`: The PromQL expression that Prometheus will evaluate
+- `for`: How long `expr` must remain true before the alert fires
+```
+
 And finally, we need to configure Alertmanager to send alerts to PagerDuty.
 
 ```yaml
@@ -170,9 +182,23 @@ prometheus:
         receiver: pagerduty
         repeat_interval: 3h
         routes:
+          # Duplicate this entry for every hub on the cluster that uses an EBS volume as an NFS server
           - receiver: pagerduty
             match:
               channel: pagerduty
+              cluster: <cluster name>
+              namespace: <namespace>
+```
+
+```{note}
+The important variables to understand here are:
+
+- `group_wait`: How long Alertmanager will initially wait to send a notification to PagerDuty for a group of alerts
+- `group_interval`: How long Alertmanager will wait to send a notification to PagerDuty for new alerts in a group for which an initial notification has already been sent
+- `repeat_interval`: How long Alertmanager will wait to send a notification to PagerDuty again if it has already sent a successful notification
+- `match`: These labels are used to group fired alerts together and are how we manage separate incidents per hub per cluster in PagerDuty
+
+[Read more about these configuration options.](https://prometheus.io/docs/alerting/latest/configuration/#route)
 ```
 
 ## Increasing the size of the volume used by the NFS server