diff --git a/config/clusters/nasa-veda/support.values.yaml b/config/clusters/nasa-veda/support.values.yaml index b43e07e4d..7591a19b0 100644 --- a/config/clusters/nasa-veda/support.values.yaml +++ b/config/clusters/nasa-veda/support.values.yaml @@ -32,6 +32,18 @@ redirects: to: staging.hub.openveda.cloud prometheus: + alertmanager: + enabled: true + config: + route: + group_wait: 10s + group_interval: 5m + receiver: pagerduty + repeat_interval: 3h + routes: + - receiver: pagerduty + match: + channel: pagerduty server: ingress: enabled: true @@ -41,6 +53,20 @@ prometheus: - secretName: prometheus-tls hosts: - prometheus.nasa-veda.2i2c.cloud + serverFiles: + alerting_rules.yml: + groups: + - name: NASA VEDA jupyterhub-home-nfs EBS volume full + rules: + - alert: jupyterhub-home-nfs-ebs-full + expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1 + for: 15m + labels: + severity: critical + channel: pagerduty + cluster: nasa-veda + annotations: + summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" aws-ce-grafana-backend: enabled: true diff --git a/docs/howto/features/storage-quota.md b/docs/howto/features/storage-quota.md index 88454ad5e..d9b0da3dc 100644 --- a/docs/howto/features/storage-quota.md +++ b/docs/howto/features/storage-quota.md @@ -138,6 +138,62 @@ deployer deploy Once this is deployed, the hub will automatically enforce the storage quota for each user. If a user's home directory exceeds the quota, the user's pod may not be able to start successfully. +## Enabling alerting through Prometheus Alertmanager + +Once we have enabled storage quotas, we want to be alerted when the disk usage of the NFS server exceeds a certain threshold so that we can take appropriate action. 
+ +To do this, we define a Prometheus alerting rule that fires when the disk usage of the NFS server exceeds a certain threshold, and use Alertmanager to deliver that alert. + +First, we need to enable Alertmanager in the cluster's support values file (for example, [here's the one for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/config/clusters/nasa-veda/support.values.yaml)). + +```yaml +prometheus: + alertmanager: + enabled: true +``` + +Then, we need to add the alerting rule itself. For example, to be alerted when the disk usage of the NFS server stays above 90% of the total disk size for 15 minutes (that is, when less than 10% of the volume is available), we would add the following to the cluster's support values file, replacing `<cluster-name>` with the name of the cluster: + +```yaml +prometheus: + serverFiles: + alerting_rules.yml: + groups: + - name: jupyterhub-home-nfs EBS volume full + rules: + - alert: jupyterhub-home-nfs-ebs-full + expr: node_filesystem_avail_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} / node_filesystem_size_bytes{mountpoint="/shared-volume", component="shared-volume-metrics"} < 0.1 + for: 15m + labels: + severity: critical + channel: pagerduty + cluster: <cluster-name> + annotations: + summary: "jupyterhub-home-nfs EBS volume full in namespace {{ $labels.namespace }}" +``` + +And finally, we need to configure Alertmanager to route these alerts to PagerDuty. The route below sends alerts labelled `channel: pagerduty` to the `pagerduty` receiver; the receiver itself, together with its PagerDuty service key, is defined in the encrypted `helm-charts/support/enc-support.secret.values.yaml` file. + +```yaml +prometheus: + alertmanager: + enabled: true + config: + route: + group_wait: 10s + group_interval: 5m + receiver: pagerduty + repeat_interval: 3h + routes: + - receiver: pagerduty + match: + channel: pagerduty +``` + +## Increasing the size of the volume used by the NFS server + +If the volume used by the NFS server is close to being full, we may need to increase the size of the volume. This can be done by following the instructions in the [Increase the size of an AWS EBS volume](howto:increase-size-aws-ebs) guide.
+ ## Troubleshooting ### Checking the NFS server is running properly diff --git a/docs/howto/filesystem-management/increase-size-aws-ebs.md b/docs/howto/filesystem-management/increase-size-aws-ebs.md new file mode 100644 index 000000000..c6b0a514e --- /dev/null +++ b/docs/howto/filesystem-management/increase-size-aws-ebs.md @@ -0,0 +1,18 @@ +(howto:increase-size-aws-ebs)= +# Increase the size of an AWS EBS volume + +To increase the size of an AWS EBS volume, we need to increase the `size` of that volume in the [tfvars file of the cluster](https://github.com/2i2c-org/infrastructure/tree/main/terraform/aws/projects). + +For example, to increase the size of the EBS volume used by `jupyterhub-home-nfs` for the `staging` hub in the `nasa-veda` cluster, we would increase the `size` parameter in the `ebs_volumes` block for the `staging` hub in the [tfvars file for the `nasa-veda` cluster](https://github.com/2i2c-org/infrastructure/blob/main/terraform/aws/projects/nasa-veda.tfvars). + +After updating the tfvars file, we need to plan and apply the changes using Terraform: + + ```bash + cd terraform/aws + terraform plan -var-file=projects/$CLUSTER_NAME.tfvars + terraform apply -var-file=projects/$CLUSTER_NAME.tfvars + ``` + +```{note} +The size of an EBS volume can only be increased, not decreased. +``` diff --git a/docs/howto/filesystem-management/index.md b/docs/howto/filesystem-management/index.md index 412b4d929..49ac69d6e 100644 --- a/docs/howto/filesystem-management/index.md +++ b/docs/howto/filesystem-management/index.md @@ -7,4 +7,5 @@ This documentation covers tasks related to managing filesystems.
:maxdepth: 2 filesystem-backups/index decrease-size-gcp-filestore +increase-size-aws-ebs ``` diff --git a/helm-charts/support/enc-support.secret.values.yaml b/helm-charts/support/enc-support.secret.values.yaml index 25f716d99..b00e7c3b8 100644 --- a/helm-charts/support/enc-support.secret.values.yaml +++ b/helm-charts/support/enc-support.secret.values.yaml @@ -1,20 +1,24 @@ grafana: - adminPassword: ENC[AES256_GCM,data:B+8/JqM4p6FytAp+8Ec4qzVKANZTVGuaHKMm7vyamhA2Wid5VBiRfIzghPXRxOakPtp8iTXqdO0I/AVu+HQDuA==,iv:ojVQ9u+cCGJeo/e8MbX4LKfKejXq/kjB2wtpjpNsCHA=,tag:Uggw1CS1+fmQdN0GsLhN9w==,type:str] -pagerduty-prometheus-integration: - integrationName: ENC[AES256_GCM,data:REEthqjPY3MEnA==,iv:KTals1+rPcGMAlT8zR5f6Y5RbhWuR9+QkxFQ59L7u/U=,tag:PynUx7lYUgcEmsLsnP+Z3g==,type:str] - integrationKey: ENC[AES256_GCM,data:RRwGXmQghrayoyHGnuPgnH7mkTA68m10ZHyCOn6fJMw=,iv:gLrxOnT6hvAKHSpC6x2mZxfHL/8O00QJU+BBMFo77S0=,tag:bE0YoHvelMLFDWUwSDDPvA==,type:str] - integrationUrl: ENC[AES256_GCM,data:+7kaZ0QGYlyrUpjBBDyZVvnZPIbyjBOUP8FP8DhwhVcSpIdOJn4Koq8NvVJxuTMBgbJiIn3JuVB1bkJCUIVqfW0=,iv:L5QgYp3IvKmwd1GZuv7n0nIXwcrtKeGx7wY+XSDvjjw=,tag:zFhdkN4XXmnaV8xyuW5+fQ==,type:str] + adminPassword: ENC[AES256_GCM,data:oNCAulbNDA7g4jJ3G9j7I5Uqd/XaKo5MvatryHVfat+2bjcqFOlpdHfiZsVZNgNP8kxMNwAlkeLJ7BGlFqYA2g==,iv:lJlTLyO9bvDp0zsE+dZQh1thfE7IfnABhAeynKLDUhA=,tag:jw3PN8OVr7+zl4ThdDlAAw==,type:str] +prometheus: + alertmanager: + config: + receivers: + - name: ENC[AES256_GCM,data:PuB35BjALacz,iv:39j9vTvzB1IB2pEZi+psoAv9FDMikOjrxps6+yxpLEQ=,tag:9apUjoupQkG6W9hsaZ6QHg==,type:str] + pagerduty_configs: + - ENC[AES256_GCM,data:X91fQd0gulxQkTtfMk48RK4EQ0gHJXrBaUeEGZmNYiGZWuGFm7Hf7Cj+yO4Fg51vzqMDwGIVhP3LH80PQbzjo96qJjzVJwAF7SK30eTj9iEcOg==,iv:ymurqszkJ2xB5dj66EAuytARi9mS2oKMdxfbnRzwgP4=,tag:4fYp5OKFiLm2usIqPNN1lQ==,type:comment] + - service_key: ENC[AES256_GCM,data:bCMB2VURBRRPvrV542HQMUGHs9kicvwcD0maVqxuH2I=,iv:NTewUsy4xZNsA9xWVr1Yd62Z2ubtOVMe86erwstXgu4=,tag:DfPqZP1fR/sTNJfVuaCSwA==,type:str] sops: kms: [] gcp_kms: - 
resource_id: projects/two-eye-two-see/locations/global/keyRings/sops-keys/cryptoKeys/similar-hubs - created_at: "2024-12-05T10:40:02Z" - enc: CiUA4OM7eJ4G+2WdZ9oThEMrrlJSek0IrnRDykmPQYXW99zmb8HQEkkAnGhyNt6qUGehjrI1ovZKXU0p+cr5JwiFBA8PQs/3FR4Rdcy1ZX45Ed0sModB4VvWQlG9WgMC+75fzStuUUZUDtGsV9xvZeM0 + created_at: "2024-12-18T10:09:21Z" + enc: CiUA4OM7eOThrN3NqDXYd4belMxUdYfF8XpeUSYfiefKkYutN2KoEkkAnGhyNqzzrXyUXTvtS4xl4IjlJpo6hwm9FVbQAyh2Vw9dcXH6h1+NNUHGPYj6KQJvvWZzgXHFLiWdqOfUgTJME0YDx/DYgXgm azure_kv: [] hc_vault: [] age: [] - lastmodified: "2024-12-05T10:40:02Z" - mac: ENC[AES256_GCM,data:6rssn9qvZh2ekypkcMmGhoNwGjhGqs85Ld1R1zkIpwx670rTubJlvmFsyslGuqU4JJJ1aJ8A/KKtlMJMjhyGpLGA1krKWf9tgV0E3G/GAk2kR6h/lbdCfPhR+DsHdOAlWuHZTFz2bJuBKboi1znoNoOSKI+32WU6kFNnkcEqgTI=,iv:EAfU8mGbWegtCqu0M4DAjy29cEOJBMgM8/32vRAZf4Y=,tag:E0FtNKt80cRcw9kRXMjW7w==,type:str] + lastmodified: "2024-12-18T10:09:21Z" + mac: ENC[AES256_GCM,data:FLVIB1iZCSfuCwAiL0wBVHik2kO2xfR9iV0TgW2skPWMNJA+RWPhyHk8Ma3SsH/7iSOR8acwrFZ0b6DBhAtTtQEV1DGdSm9LvdHE2hkG7oFZzkdXYaC2YcwwkVJzfEmwZIn7OECXTL2W2I/5ZUEcs0bUW/5YlKmMzphtt9DL5o4=,iv:NdMEKBn90DyGgkrNYG8VKhhe4S+H57XM6AgAqJWA1q4=,tag:0lLWwN5M9jXmV8EzP/Y7nw==,type:str] pgp: [] unencrypted_suffix: _unencrypted version: 3.9.1