From 108fa7ccb8a6d6ae62d89333c8c690692d9fbd6b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 19 Sep 2024 13:28:41 +0100 Subject: [PATCH 01/90] Added prometheus operator role compatible with state_dir (still needs more affinity) --- ansible/.gitignore | 2 + ansible/roles/k3s/tasks/main.yml | 5 + .../kube_prometheus_stack/defaults/main.yml | 122 ++++++++++++++++++ .../kube_prometheus_stack/tasks/main.yml | 72 +++++++++++ requirements.txt | 1 + requirements.yml | 2 + 6 files changed, 204 insertions(+) create mode 100644 ansible/roles/kube_prometheus_stack/defaults/main.yml create mode 100644 ansible/roles/kube_prometheus_stack/tasks/main.yml diff --git a/ansible/.gitignore b/ansible/.gitignore index f2268c478..f257f66d6 100644 --- a/ansible/.gitignore +++ b/ansible/.gitignore @@ -62,3 +62,5 @@ roles/* !roles/k3s/** !roles/k9s/ !roles/k9s/** +!roles/kube_prometheus_stack +!roles/kube_prometheus_stack/** diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index c52c47ba6..ce8bd0bef 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -41,3 +41,8 @@ copy: src: start_k3s.yml dest: /etc/ansible-init/playbooks/0-start-k3s.yml + +- name: Install pip dependencies for k8s ansible module + ansible.builtin.pip: + name: + - kubernetes==30.1.0 \ No newline at end of file diff --git a/ansible/roles/kube_prometheus_stack/defaults/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main.yml new file mode 100644 index 000000000..1e1d900c9 --- /dev/null +++ b/ansible/roles/kube_prometheus_stack/defaults/main.yml @@ -0,0 +1,122 @@ +--- + +# The chart to use +kube_prometheus_stack_chart_repo: https://prometheus-community.github.io/helm-charts +kube_prometheus_stack_chart_name: kube-prometheus-stack +kube_prometheus_stack_chart_version: 59.1.0 + +# Release information +kube_prometheus_stack_release_namespace: monitoring-system +kube_prometheus_stack_release_name: kube-prometheus-stack + +# The timeout to wait for 
the release to become ready +kube_prometheus_stack_wait_timeout: 10m + +# The external URLs for Prometheus and Alertmanager +_kube_prometheus_stack_external_url_tls_enabled: >- + {{- + admin_dashboard_ingress_tls_enabled | + default(ingress_tls_enabled | default(True)) + }} +_kube_prometheus_stack_external_url_scheme: >- + {{- "https" if _kube_prometheus_stack_external_url_tls_enabled else "http" }} +_kube_prometheus_stack_alertmanager_host: >- + {{- + admin_dashboard_ingress_alertmanager_host | + default( + "{}.{}".format(ingress_alertmanager_subdomain, ingress_base_domain) + if ingress_alertmanager_subdomain is defined and ingress_base_domain is defined + else "" + ) + }} +kube_prometheus_stack_alertmanager_external_url: >- + {{- + "{}://{}".format( + _kube_prometheus_stack_external_url_scheme, + _kube_prometheus_stack_alertmanager_host + ) + if _kube_prometheus_stack_alertmanager_host + else "" + }} +_kube_prometheus_stack_prometheus_host: >- + {{- + admin_dashboard_ingress_prometheus_host | + default( + "{}.{}".format(ingress_prometheus_subdomain, ingress_base_domain) + if ingress_prometheus_subdomain is defined and ingress_base_domain is defined + else "" + ) + }} +kube_prometheus_stack_prometheus_external_url: >- + {{- + "{}://{}".format( + _kube_prometheus_stack_external_url_scheme, + _kube_prometheus_stack_prometheus_host + ) + if _kube_prometheus_stack_prometheus_host + else "" + }} + +# The values for the kube-prometheus-stack release +kube_prometheus_stack_release_defaults: + defaultRules: + disabled: + # None of these are relevant in k3s context + KubeSchedulerDown: true + KubeProxyDown: true + KubeControllerManagerDown: true + prometheus: + prometheusSpec: + externalUrl: "{{ kube_prometheus_stack_prometheus_external_url }}" + podMonitorSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + probeSelectorNilUsesHelmValues: false + ruleSelectorNilUsesHelmValues: false + storageSpec: + volumeClaimTemplate: + spec: + volumeName: 
appliances-state-dir + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 4Gi + # may need to change these + securityContext: + runAsUser: 0 + runAsNonRoot: false + runAsGroup: 0 + fsGroup: 0 + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + + + grafana: + sidecar: + dashboards: + searchNamespace: ALL + grafana.ini: + auth.anonymous: + enabled: true + alertmanager: + # Don't apply the namespace grouping by default + config: + route: + group_by: ['...'] + alertmanagerSpec: + externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" + # Make sure that alertmanager finds configurations with the alertmanager name as a label + alertmanagerConfigSelector: + matchLabels: + alertmanager: "{{ kube_prometheus_stack_release_name }}-alertmanager" + # Do NOT add the namespace matcher to routes from AlertmanagerConfig resources + alertmanagerConfigMatcherStrategy: + type: None + +kube_prometheus_stack_release_overrides: {} + +kube_prometheus_stack_release_values: >- + {{- + kube_prometheus_stack_release_defaults | + combine(kube_prometheus_stack_release_overrides, recursive = True) + }} diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml new file mode 100644 index 000000000..92d60963e --- /dev/null +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -0,0 +1,72 @@ +--- + +# Because of the way Helm handles CRDs, we upgrade them first +- name: Get kube-prometheus-stack CRDs + command: >- + helm show crds + {{ kube_prometheus_stack_chart_name }} + --repo {{ kube_prometheus_stack_chart_repo }} + --version {{ kube_prometheus_stack_chart_version }} + register: kube_prometheus_stack_crds + +- name: Install kube-prometheus-stack CRDs + # Use server-side apply because some of the CRDs are too big to fit in the annotation + command: kubectl apply --server-side=true --force-conflicts=true -f - + args: + 
stdin: "{{ kube_prometheus_stack_crds.stdout }}" + +- name: Create hostPath volume in /var/lib/state + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: appliances-state-dir + labels: + app.kubernetes.io/name: appliances-state-dir + spec: + capacity: + storage: 4Gi + accessModes: + - ReadWriteOnce + hostPath: + path: "{{ appliances_state_dir }}/prometheus" + type: DirectoryOrCreate + +- name: Disable rancher default storage class + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + state: patched + definition: + kind: StorageClass + metadata: + name: local-path + annotations: + storageclass.kubernetes.io/is-default-class: "false" + +- name: Install kube-prometheus-stack on target Kubernetes cluster + kubernetes.core.helm: + chart_ref: "{{ kube_prometheus_stack_chart_name }}" + chart_repo_url: "{{ kube_prometheus_stack_chart_repo }}" + chart_version: "{{ kube_prometheus_stack_chart_version }}" + release_namespace: "{{ kube_prometheus_stack_release_namespace }}" + release_name: "{{ kube_prometheus_stack_release_name }}" + release_values: "{{ kube_prometheus_stack_release_values }}" + atomic: yes + create_namespace: yes + wait: yes + wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" + +- name: Expose prometheus dashboard + kubernetes.core.k8s_service: + state: present + name: prometheus-external + type: NodePort + namespace: "{{ kube_prometheus_stack_release_namespace }}" + ports: + - port: 9090 + targetPort: 9090 + protocol: TCP + selector: + app.kubernetes.io/instance: kube-prometheus-stack-prometheus diff --git a/requirements.txt b/requirements.txt index 6651506fb..5dbe97d91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 netaddr matplotlib +kubernetes diff --git 
a/requirements.yml b/requirements.yml index da6ac5d29..e4b6bd6ef 100644 --- a/requirements.yml +++ b/requirements.yml @@ -49,4 +49,6 @@ collections: - name: https://github.com/azimuth-cloud/ansible-collection-image-utils type: git version: main # update on release + - name: kubernetes.core + version: 2.4.2 ... From 9836ef8e58ad8ccac01a97ae82bd0ffd4cc8a226 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 20 Sep 2024 11:28:56 +0100 Subject: [PATCH 02/90] Added node selectors for non-exporter pods --- ansible/roles/k3s/tasks/main.yml | 4 ++++ .../roles/kube_prometheus_stack/defaults/main.yml | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index db207f77f..49f399243 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -43,6 +43,10 @@ src: start_k3s.yml dest: /etc/ansible-init/playbooks/0-start-k3s.yml +- name: Install pip + dnf: + name: python3-pip + - name: Install pip dependencies for k8s ansible module ansible.builtin.pip: name: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main.yml index 1e1d900c9..133e8278d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main.yml @@ -98,6 +98,9 @@ kube_prometheus_stack_release_defaults: grafana.ini: auth.anonymous: enabled: true + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + alertmanager: # Don't apply the namespace grouping by default config: @@ -112,6 +115,16 @@ kube_prometheus_stack_release_defaults: # Do NOT add the namespace matcher to routes from AlertmanagerConfig resources alertmanagerConfigMatcherStrategy: type: None + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + + prometheusOperator: + 
nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + + kube-state-metrics: + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" kube_prometheus_stack_release_overrides: {} From d790b2b39e98454e3cc2bb79527b89c73c2a2008 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 20 Sep 2024 15:11:21 +0100 Subject: [PATCH 03/90] Added services for monitoring --- .../kube_prometheus_stack/tasks/main.yml | 36 ++++++++++++++++--- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 92d60963e..1567e9179 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -58,15 +58,43 @@ wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" -- name: Expose prometheus dashboard +# Not looping through this because ports get templated as strings for some reason + +- name: Expose prometheus kubernetes.core.k8s_service: state: present - name: prometheus-external - type: NodePort + name: "prometheus-external" + type: LoadBalancer namespace: "{{ kube_prometheus_stack_release_namespace }}" ports: - port: 9090 targetPort: 9090 protocol: TCP selector: - app.kubernetes.io/instance: kube-prometheus-stack-prometheus + app.kubernetes.io/name: "prometheus" + +- name: Expose grafana + kubernetes.core.k8s_service: + state: present + name: "grafana-external" + type: LoadBalancer + namespace: "{{ kube_prometheus_stack_release_namespace }}" + ports: + - port: 3000 + targetPort: 3000 + protocol: TCP + selector: + app.kubernetes.io/name: "grafana" + +- name: Expose alertmanager + kubernetes.core.k8s_service: + state: present + name: "alertmanager-external" + type: LoadBalancer + namespace: "{{ kube_prometheus_stack_release_namespace }}" + ports: + - port: 
9093 + targetPort: 9093 + protocol: TCP + selector: + app.kubernetes.io/name: "alertmanager" From 10af75df54ed67ccf08bf74b9afbcf0b99fdd33d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 24 Sep 2024 14:01:45 +0100 Subject: [PATCH 04/90] WIP porting prometheus rolevars --- .../kube_prometheus_stack/defaults/main.yml | 250 +++++++++++++++++- .../kube_prometheus_stack/tasks/main.yml | 4 +- 2 files changed, 251 insertions(+), 3 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main.yml index 133e8278d..e20735e96 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main.yml @@ -57,6 +57,234 @@ kube_prometheus_stack_prometheus_external_url: >- else "" }} +### PREVIOUS ROLE VALUES + +prometheus_image_tag: "v2.27.0" + +# prometheus_config_dir: /etc/prometheus +prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" +# prometheus_read_only_dirs: [] + +# prometheus_binary_local_dir: '' +# prometheus_skip_install: false + +# prometheus_web_listen_address: "0.0.0.0:9090" +# prometheus_web_external_url: '' +# See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md +# prometheus_web_config: +# tls_server_config: {} +# http_server_config: {} +# basic_auth_users: {} + +prometheus_storage_retention: "30d" +# Available since Prometheus 2.7.0 +# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units +# supported: KB, MB, GB, TB, PB. 
+prometheus_storage_retention_size: "40GB" + +kube_prometheus_stack_volume_size: 40Gi + +prometheus_config_flags_extra: {} +# prometheus_config_flags_extra: +# storage.tsdb.retention: 15d +# alertmanager.timeout: 10s + +prometheus_alertmanager_config: [] +# prometheus_alertmanager_config: +# - scheme: https +# path_prefix: alertmanager/ +# basic_auth: +# username: user +# password: pass +# static_configs: +# - targets: ["127.0.0.1:9093"] +# proxy_url: "127.0.0.2" + +prometheus_alert_relabel_configs: [] +# prometheus_alert_relabel_configs: +# - action: labeldrop +# regex: replica + +prometheus_global: + scrape_interval: 15s + scrape_timeout: 10s + evaluation_interval: 15s + +prometheus_remote_write: [] +# prometheus_remote_write: +# - url: https://dev.kausal.co/prom/push +# basic_auth: +# password: FOO + +prometheus_remote_read: [] +# prometheus_remote_read: +# - url: https://demo.cloudalchemy.org:9201/read +# basic_auth: +# password: FOO + +# todo: readd +# prometheus_external_labels: +# environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" + +prometheus_targets: {} +# node: +# - targets: +# - localhost:9100 +# labels: +# env: test + +# todo: readd +# prometheus_scrape_configs: +# - job_name: "prometheus" +# metrics_path: "{{ prometheus_metrics_path }}" +# static_configs: +# - targets: +# - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090" +# - job_name: "node" +# file_sd_configs: +# - files: +# - "{{ prometheus_config_dir }}/file_sd/node.yml" + +# Alternative config file name, searched in ansible templates path. 
+# prometheus_config_file: 'prometheus.yml.j2' + +# prometheus_alert_rules_files: +# - prometheus/rules/*.rules + +# prometheus_static_targets_files: +# - prometheus/targets/*.yml +# - prometheus/targets/*.json + +prometheus_alert_rules: + - alert: Watchdog + expr: vector(1) + for: 10m + labels: + severity: warning + annotations: + description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty." + summary: 'Ensure entire alerting pipeline is functional' + - alert: InstanceDown + expr: 'up == 0' + for: 5m + labels: + severity: critical + annotations: + description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}' + summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}' + - alert: RebootRequired + expr: 'node_reboot_required > 0' + labels: + severity: warning + annotations: + description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}' + summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}' + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}' + summary: 'Filesystem is predicted to run out of space within the next 24 hours.' 
+ expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}' + summary: 'Filesystem is predicted to run out of space within the next 4 hours.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' + summary: 'Filesystem has less than 5% space left.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' + summary: 'Filesystem has less than 3% space left.' 
+ expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}' + summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}' + summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' + summary: 'Filesystem has less than 5% inodes left.' 
+ expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' + summary: 'Filesystem has less than 3% inodes left.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}' + summary: 'Network interface is reporting many receive errors.' + expr: "increase(node_network_receive_errs_total[2m]) > 10\n" + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}' + summary: 'Network interface is reporting many transmit errors.' 
+ expr: "increase(node_network_transmit_errs_total[2m]) > 10\n" + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}' + summary: 'Number of conntrack are getting close to the limit' + expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n" + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}' + summary: 'Clock skew detected.' + expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}' + summary: 'Clock not synchronising.' 
+ expr: "min_over_time(node_timex_sync_status[5m]) == 0\n" + for: 10m + labels: + severity: warning + +# ------------------------------------------------------------------------------------------ + # The values for the kube-prometheus-stack release kube_prometheus_stack_release_defaults: defaultRules: @@ -72,6 +300,8 @@ kube_prometheus_stack_release_defaults: serviceMonitorSelectorNilUsesHelmValues: false probeSelectorNilUsesHelmValues: false ruleSelectorNilUsesHelmValues: false + image: + tag: "{{ prometheus_image_tag }}" storageSpec: volumeClaimTemplate: spec: @@ -80,7 +310,17 @@ kube_prometheus_stack_release_defaults: - ReadWriteOnce resources: requests: - storage: 4Gi + storage: "{{ kube_prometheus_stack_volume_size }}" + retention: "{{ prometheus_storage_retention }}" + retentionSize: "{{ prometheus_storage_retention_size }}" + additionalAlertRelabelConfigs: "{{ prometheus_alert_relabel_configs }}" + scrapeInterval: "{{ prometheus_global.scrape_interval }}" + scrapeTimeout: "{{ prometheus_global.scrape_timeout }}" + evaluationInterval: "{{ prometheus_global.evaluation_interval }}" + remoteRead: "{{ prometheus_remote_read }}" + remoteWrite: "{{ prometheus_remote_write }}" + # externalLabels: "{{ prometheus_external_labels }}" TODO: readd + # additionalScrapeConfigs: "{{ prometheus_scrape_configs }}" TODO:readd # may need to change these securityContext: runAsUser: 0 @@ -90,6 +330,7 @@ kube_prometheus_stack_release_defaults: nodeSelector: kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" grafana: sidecar: @@ -106,6 +347,8 @@ kube_prometheus_stack_release_defaults: config: route: group_by: ['...'] + global: + resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" alertmanagerSpec: externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" # Make sure that alertmanager finds configurations 
with the alertmanager name as a label @@ -117,6 +360,11 @@ kube_prometheus_stack_release_defaults: type: None nodeSelector: kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" + serviceMonitor: + scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" + proxyUrl: "{{ prometheus_alertmanager_config.proxy_url | default( '' ) }}" + prometheusOperator: nodeSelector: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 1567e9179..e9f8de18c 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -27,11 +27,11 @@ app.kubernetes.io/name: appliances-state-dir spec: capacity: - storage: 4Gi + storage: "{{ kube_prometheus_stack_volume_size }}" accessModes: - ReadWriteOnce hostPath: - path: "{{ appliances_state_dir }}/prometheus" + path: "{{ prometheus_db_dir }}" type: DirectoryOrCreate - name: Disable rancher default storage class From 7b29a3b4ce1019cdcea757e1eda46f25715823dd Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 25 Sep 2024 09:10:22 +0100 Subject: [PATCH 05/90] Added ingress for monitoring services --- .../kube_prometheus_stack/defaults/main.yml | 28 +++++++++ .../kube_prometheus_stack/tasks/main.yml | 57 ++++++------------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main.yml index e20735e96..f05fe8488 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main.yml @@ -285,6 +285,8 @@ prometheus_alert_rules: # ------------------------------------------------------------------------------------------ +control_sslip: "{{ ansible_all_ipv4_addresses[0] | regex_replace('\\.', '-') }}.sslip.io" + # 
The values for the kube-prometheus-stack release kube_prometheus_stack_release_defaults: defaultRules: @@ -294,7 +296,15 @@ kube_prometheus_stack_release_defaults: KubeProxyDown: true KubeControllerManagerDown: true prometheus: + ingress: + enabled: true + hosts: + - "{{ control_sslip }}" + ingressClassName: traefik + paths: + - /prometheus prometheusSpec: + routePrefix: /prometheus externalUrl: "{{ kube_prometheus_stack_prometheus_external_url }}" podMonitorSelectorNilUsesHelmValues: false serviceMonitorSelectorNilUsesHelmValues: false @@ -333,16 +343,33 @@ kube_prometheus_stack_release_defaults: additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" grafana: + ingress: + enabled: true + hosts: + - "{{ control_sslip }}" + ingressClassName: traefik + path: / sidecar: dashboards: searchNamespace: ALL grafana.ini: + server: + domain: "{{ control_sslip }}" + root_url: "http://{{ control_sslip }}/grafana" + serve_from_sub_path: true auth.anonymous: enabled: true nodeSelector: kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" alertmanager: + ingress: + enabled: true + hosts: + - "{{ control_sslip }}" + ingressClassName: traefik + paths: + - /alertmanager # Don't apply the namespace grouping by default config: route: @@ -350,6 +377,7 @@ kube_prometheus_stack_release_defaults: global: resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" alertmanagerSpec: + routePrefix: /alertmanager externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" # Make sure that alertmanager finds configurations with the alertmanager name as a label alertmanagerConfigSelector: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index e9f8de18c..a3828f9e9 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -58,43 +58,20 @@ wait: yes 
wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" -# Not looping through this because ports get templated as strings for some reason - -- name: Expose prometheus - kubernetes.core.k8s_service: - state: present - name: "prometheus-external" - type: LoadBalancer - namespace: "{{ kube_prometheus_stack_release_namespace }}" - ports: - - port: 9090 - targetPort: 9090 - protocol: TCP - selector: - app.kubernetes.io/name: "prometheus" - -- name: Expose grafana - kubernetes.core.k8s_service: - state: present - name: "grafana-external" - type: LoadBalancer - namespace: "{{ kube_prometheus_stack_release_namespace }}" - ports: - - port: 3000 - targetPort: 3000 - protocol: TCP - selector: - app.kubernetes.io/name: "grafana" - -- name: Expose alertmanager - kubernetes.core.k8s_service: - state: present - name: "alertmanager-external" - type: LoadBalancer - namespace: "{{ kube_prometheus_stack_release_namespace }}" - ports: - - port: 9093 - targetPort: 9093 - protocol: TCP - selector: - app.kubernetes.io/name: "alertmanager" +# kind: Ingress +# metadata: +# name: test-ingress +# annotations: +# traefik.ingress.kubernetes.io/router.entrypoints: web +# spec: +# rules: +# - host: 172-16-0-125.sslip.io +# http: +# paths: +# - path: /grafana +# pathType: Prefix +# backend: +# service: +# name: kube-prometheus-stack-grafana +# port: +# name: http-web \ No newline at end of file From b959e92ea31c71af677b7ca64861f139413b7885 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 25 Sep 2024 10:43:18 +0100 Subject: [PATCH 06/90] Refactored + re-enabled external labels (not sure if working) --- .../defaults/main/helm.yml | 167 +++++++++++++++++ .../defaults/{ => main}/main.yml | 177 +----------------- 2 files changed, 172 insertions(+), 172 deletions(-) create mode 100644 ansible/roles/kube_prometheus_stack/defaults/main/helm.yml rename ansible/roles/kube_prometheus_stack/defaults/{ => main}/main.yml (64%) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml 
b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml new file mode 100644 index 000000000..d6e45885e --- /dev/null +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -0,0 +1,167 @@ +# The external URLs for Prometheus and Alertmanager +_kube_prometheus_stack_external_url_tls_enabled: >- + {{- + admin_dashboard_ingress_tls_enabled | + default(ingress_tls_enabled | default(True)) + }} +_kube_prometheus_stack_external_url_scheme: >- + {{- "https" if _kube_prometheus_stack_external_url_tls_enabled else "http" }} +_kube_prometheus_stack_alertmanager_host: >- + {{- + admin_dashboard_ingress_alertmanager_host | + default( + "{}.{}".format(ingress_alertmanager_subdomain, ingress_base_domain) + if ingress_alertmanager_subdomain is defined and ingress_base_domain is defined + else "" + ) + }} +kube_prometheus_stack_alertmanager_external_url: >- + {{- + "{}://{}".format( + _kube_prometheus_stack_external_url_scheme, + _kube_prometheus_stack_alertmanager_host + ) + if _kube_prometheus_stack_alertmanager_host + else "" + }} +_kube_prometheus_stack_prometheus_host: >- + {{- + admin_dashboard_ingress_prometheus_host | + default( + "{}.{}".format(ingress_prometheus_subdomain, ingress_base_domain) + if ingress_prometheus_subdomain is defined and ingress_base_domain is defined + else "" + ) + }} +kube_prometheus_stack_prometheus_external_url: >- + {{- + "{}://{}".format( + _kube_prometheus_stack_external_url_scheme, + _kube_prometheus_stack_prometheus_host + ) + if _kube_prometheus_stack_prometheus_host + else "" + }} + +# The values for the kube-prometheus-stack release +kube_prometheus_stack_release_defaults: + defaultRules: + disabled: + # None of these are relevant in k3s context + KubeSchedulerDown: true + KubeProxyDown: true + KubeControllerManagerDown: true + prometheus: + ingress: + enabled: true + hosts: + - "{{ control_sslip }}" + ingressClassName: traefik + paths: + - /prometheus + prometheusSpec: + routePrefix: /prometheus + externalUrl: "{{ 
kube_prometheus_stack_prometheus_external_url }}" + podMonitorSelectorNilUsesHelmValues: false + serviceMonitorSelectorNilUsesHelmValues: false + probeSelectorNilUsesHelmValues: false + ruleSelectorNilUsesHelmValues: false + image: + tag: "{{ prometheus_image_tag }}" + storageSpec: + volumeClaimTemplate: + spec: + volumeName: appliances-state-dir + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "{{ kube_prometheus_stack_volume_size }}" + retention: "{{ prometheus_storage_retention }}" + retentionSize: "{{ prometheus_storage_retention_size }}" + additionalAlertRelabelConfigs: "{{ prometheus_alert_relabel_configs }}" + scrapeInterval: "{{ prometheus_global.scrape_interval }}" + scrapeTimeout: "{{ prometheus_global.scrape_timeout }}" + evaluationInterval: "{{ prometheus_global.evaluation_interval }}" + remoteRead: "{{ prometheus_remote_read }}" + remoteWrite: "{{ prometheus_remote_write }}" + externalLabels: "{{ prometheus_external_labels }}" + # additionalScrapeConfigs: "{{ prometheus_scrape_configs }}" TODO: readd + # may need to change these + securityContext: + runAsUser: 0 + runAsNonRoot: false + runAsGroup: 0 + fsGroup: 0 + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + + additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" + + grafana: + ingress: + enabled: true + hosts: + - "{{ control_sslip }}" + ingressClassName: traefik + path: /grafana + sidecar: + dashboards: + searchNamespace: ALL + grafana.ini: + server: + domain: "{{ control_sslip }}" + root_url: "http://{{ control_sslip }}/grafana" + serve_from_sub_path: true + auth.anonymous: + enabled: true + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + + alertmanager: + ingress: + enabled: true + hosts: + - "{{ control_sslip }}" + ingressClassName: traefik + paths: + - /alertmanager + # Don't apply the namespace 
grouping by default + config: + route: + group_by: ['...'] + global: + resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" + alertmanagerSpec: + routePrefix: /alertmanager + externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" + # Make sure that alertmanager finds configurations with the alertmanager name as a label + alertmanagerConfigSelector: + matchLabels: + alertmanager: "{{ kube_prometheus_stack_release_name }}-alertmanager" + # Do NOT add the namespace matcher to routes from AlertmanagerConfig resources + alertmanagerConfigMatcherStrategy: + type: None + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" + serviceMonitor: + scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" + proxyUrl: "{{ prometheus_alertmanager_config.proxy_url | default( '' ) }}" + + + prometheusOperator: + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + + kube-state-metrics: + nodeSelector: + kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + +kube_prometheus_stack_release_overrides: {} + +kube_prometheus_stack_release_values: >- + {{- + kube_prometheus_stack_release_defaults | + combine(kube_prometheus_stack_release_overrides, recursive = True) + }} diff --git a/ansible/roles/kube_prometheus_stack/defaults/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml similarity index 64% rename from ansible/roles/kube_prometheus_stack/defaults/main.yml rename to ansible/roles/kube_prometheus_stack/defaults/main/main.yml index f05fe8488..cd0635658 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -12,50 +12,7 @@ 
kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 10m -# The external URLs for Prometheus and Alertmanager -_kube_prometheus_stack_external_url_tls_enabled: >- - {{- - admin_dashboard_ingress_tls_enabled | - default(ingress_tls_enabled | default(True)) - }} -_kube_prometheus_stack_external_url_scheme: >- - {{- "https" if _kube_prometheus_stack_external_url_tls_enabled else "http" }} -_kube_prometheus_stack_alertmanager_host: >- - {{- - admin_dashboard_ingress_alertmanager_host | - default( - "{}.{}".format(ingress_alertmanager_subdomain, ingress_base_domain) - if ingress_alertmanager_subdomain is defined and ingress_base_domain is defined - else "" - ) - }} -kube_prometheus_stack_alertmanager_external_url: >- - {{- - "{}://{}".format( - _kube_prometheus_stack_external_url_scheme, - _kube_prometheus_stack_alertmanager_host - ) - if _kube_prometheus_stack_alertmanager_host - else "" - }} -_kube_prometheus_stack_prometheus_host: >- - {{- - admin_dashboard_ingress_prometheus_host | - default( - "{}.{}".format(ingress_prometheus_subdomain, ingress_base_domain) - if ingress_prometheus_subdomain is defined and ingress_base_domain is defined - else "" - ) - }} -kube_prometheus_stack_prometheus_external_url: >- - {{- - "{}://{}".format( - _kube_prometheus_stack_external_url_scheme, - _kube_prometheus_stack_prometheus_host - ) - if _kube_prometheus_stack_prometheus_host - else "" - }} +control_sslip: "{{ ansible_all_ipv4_addresses[0] | regex_replace('\\.', '-') }}.sslip.io" ### PREVIOUS ROLE VALUES @@ -122,9 +79,9 @@ prometheus_remote_read: [] # basic_auth: # password: FOO -# todo: readd -# prometheus_external_labels: -# environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" +prometheus_external_labels: + environment: "{{ control_sslip }}" +# environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" 
prometheus_targets: {} # node: @@ -133,7 +90,7 @@ prometheus_targets: {} # labels: # env: test -# todo: readd +# TODO: readd # prometheus_scrape_configs: # - job_name: "prometheus" # metrics_path: "{{ prometheus_metrics_path }}" @@ -285,127 +242,3 @@ prometheus_alert_rules: # ------------------------------------------------------------------------------------------ -control_sslip: "{{ ansible_all_ipv4_addresses[0] | regex_replace('\\.', '-') }}.sslip.io" - -# The values for the kube-prometheus-stack release -kube_prometheus_stack_release_defaults: - defaultRules: - disabled: - # None of these are relevant in k3s context - KubeSchedulerDown: true - KubeProxyDown: true - KubeControllerManagerDown: true - prometheus: - ingress: - enabled: true - hosts: - - "{{ control_sslip }}" - ingressClassName: traefik - paths: - - /prometheus - prometheusSpec: - routePrefix: /prometheus - externalUrl: "{{ kube_prometheus_stack_prometheus_external_url }}" - podMonitorSelectorNilUsesHelmValues: false - serviceMonitorSelectorNilUsesHelmValues: false - probeSelectorNilUsesHelmValues: false - ruleSelectorNilUsesHelmValues: false - image: - tag: "{{ prometheus_image_tag }}" - storageSpec: - volumeClaimTemplate: - spec: - volumeName: appliances-state-dir - accessModes: - - ReadWriteOnce - resources: - requests: - storage: "{{ kube_prometheus_stack_volume_size }}" - retention: "{{ prometheus_storage_retention }}" - retentionSize: "{{ prometheus_storage_retention_size }}" - additionalAlertRelabelConfigs: "{{ prometheus_alert_relabel_configs }}" - scrapeInterval: "{{ prometheus_global.scrape_interval }}" - scrapeTimeout: "{{ prometheus_global.scrape_timeout }}" - evaluationInterval: "{{ prometheus_global.evaluation_interval }}" - remoteRead: "{{ prometheus_remote_read }}" - remoteWrite: "{{ prometheus_remote_write }}" - # externalLabels: "{{ prometheus_external_labels }}" TODO: readd - # additionalScrapeConfigs: "{{ prometheus_scrape_configs }}" TODO:readd - # may need to change these - 
securityContext: - runAsUser: 0 - runAsNonRoot: false - runAsGroup: 0 - fsGroup: 0 - nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" - - additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" - - grafana: - ingress: - enabled: true - hosts: - - "{{ control_sslip }}" - ingressClassName: traefik - path: / - sidecar: - dashboards: - searchNamespace: ALL - grafana.ini: - server: - domain: "{{ control_sslip }}" - root_url: "http://{{ control_sslip }}/grafana" - serve_from_sub_path: true - auth.anonymous: - enabled: true - nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" - - alertmanager: - ingress: - enabled: true - hosts: - - "{{ control_sslip }}" - ingressClassName: traefik - paths: - - /alertmanager - # Don't apply the namespace grouping by default - config: - route: - group_by: ['...'] - global: - resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" - alertmanagerSpec: - routePrefix: /alertmanager - externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" - # Make sure that alertmanager finds configurations with the alertmanager name as a label - alertmanagerConfigSelector: - matchLabels: - alertmanager: "{{ kube_prometheus_stack_release_name }}-alertmanager" - # Do NOT add the namespace matcher to routes from AlertmanagerConfig resources - alertmanagerConfigMatcherStrategy: - type: None - nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" - scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" - serviceMonitor: - scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" - proxyUrl: "{{ prometheus_alertmanager_config.proxy_url | default( '' ) }}" - - - prometheusOperator: - nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name 
}}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" - - kube-state-metrics: - nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" - -kube_prometheus_stack_release_overrides: {} - -kube_prometheus_stack_release_values: >- - {{- - kube_prometheus_stack_release_defaults | - combine(kube_prometheus_stack_release_overrides, recursive = True) - }} From 560eb96a10a361fd0b1e592b8dc474eaadef4e4c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 25 Sep 2024 12:20:48 +0100 Subject: [PATCH 07/90] replaced monitoring in site.yml and fixed sslip IPs --- ansible/monitoring.yml | 44 +++---------------- .../defaults/main/main.yml | 2 +- .../kube_prometheus_stack/tasks/main.yml | 18 -------- 3 files changed, 7 insertions(+), 57 deletions(-) diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index 84f319688..e8507fc0a 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -33,11 +33,13 @@ - import_role: name: filebeat -- name: Deploy node_exporter - hosts: node_exporter - tags: node_exporter +- name: Install monitoring helm chart + hosts: prometheus + become: true + tags: prometheus tasks: - - import_role: name=cloudalchemy.node_exporter + - ansible.builtin.import_role: + name: kube_prometheus_stack - name: Deploy OpenOndemand exporter hosts: openondemand @@ -57,37 +59,3 @@ tasks: - import_role: name: slurm_exporter - -- name: Setup core monitoring software - hosts: prometheus - tags: prometheus - tasks: - - name: Check for existing prometheus binaries - stat: - path: /usr/local/bin/{{ item }} - register: prometheus_binaries - loop: - - prometheus - - promtool - - name: Skip prometheus install if prometheus binaries exist and prometheus_version not defined - # i.e. 
if prometheus_version isn't defined we don't care, so use what's already there - set_fact: - prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" - when: "{{ (prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined] }}" - - import_role: - name: cloudalchemy.prometheus - -- name: Deploy grafana - hosts: grafana - tags: grafana - tasks: - - assert: - that: vault_grafana_admin_password is defined - fail_msg: "Must define vault_grafana_admin_password - use `ansible-playbook generate-passwords.yml` to generate a set of passwords" - - include_role: - name: cloudalchemy.grafana - vars: - # We use internal roles to register the dashboards as the role does not support all options that we require. - grafana_dashboards: [] - - import_role: # done in same play so it can use handlers from cloudalchemy.grafana - name: grafana-dashboards diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index cd0635658..4d91ed8e1 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -12,7 +12,7 @@ kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 10m -control_sslip: "{{ ansible_all_ipv4_addresses[0] | regex_replace('\\.', '-') }}.sslip.io" +control_sslip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) | regex_replace('\\.', '-') }}.sslip.io" ### PREVIOUS ROLE VALUES diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index a3828f9e9..f0d343c27 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -57,21 +57,3 @@ create_namespace: yes wait: yes wait_timeout: "{{ 
kube_prometheus_stack_wait_timeout }}" - -# kind: Ingress -# metadata: -# name: test-ingress -# annotations: -# traefik.ingress.kubernetes.io/router.entrypoints: web -# spec: -# rules: -# - host: 172-16-0-125.sslip.io -# http: -# paths: -# - path: /grafana -# pathType: Prefix -# backend: -# service: -# name: kube-prometheus-stack-grafana -# port: -# name: http-web \ No newline at end of file From 0106a951b8cb252728400fc0be8a7d68d0b94589 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 25 Sep 2024 15:40:14 +0100 Subject: [PATCH 08/90] Added slurm exporter service to k3s --- .../defaults/main/helm.yml | 2 +- .../defaults/main/main.yml | 3 +- .../kube_prometheus_stack/tasks/main.yml | 29 ++++++++++ .../inventory/group_vars/all/prometheus.yml | 56 +++++++++++-------- 4 files changed, 64 insertions(+), 26 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index d6e45885e..ea02dca60 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -86,7 +86,7 @@ kube_prometheus_stack_release_defaults: remoteRead: "{{ prometheus_remote_read }}" remoteWrite: "{{ prometheus_remote_write }}" externalLabels: "{{ prometheus_external_labels }}" - # additionalScrapeConfigs: "{{ prometheus_scrape_configs }}" TODO: readd + additionalScrapeConfigs: "{{ prometheus_scrape_configs }}" # may need to change these securityContext: runAsUser: 0 diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 4d91ed8e1..d61a9315a 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -12,7 +12,8 @@ kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 10m 
-control_sslip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) | regex_replace('\\.', '-') }}.sslip.io" +control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" +control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" ### PREVIOUS ROLE VALUES diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index f0d343c27..e4e0f78ca 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -45,6 +45,35 @@ annotations: storageclass.kubernetes.io/is-default-class: "false" +- name: Creating headless service for slurm exporter + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Service + metadata: + name: slurm-exporter + spec: + clusterIP: None + ports: + - name: slurm-exporter + port: 9341 + protocol: TCP + +- name: Binding slurm exporter service to host + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Endpoints + metadata: + name: slurm-exporter + subsets: + - addresses: + - ip: "{{ control_ip }}" + ports: + - port: 9341 + name: slurm-exporter + protocol: TCP + - name: Install kube-prometheus-stack on target Kubernetes cluster kubernetes.core.helm: chart_ref: "{{ kube_prometheus_stack_chart_name }}" diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 87da90e4a..ccff07c01 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -21,35 +21,43 @@ prometheus_alert_rules: [] prometheus_targets: node: "{{ groups.get('node_exporter', []) | reject('equalto', 'localhost') | prometheus_node_exporter_targets(env | default('ungrouped')) }}" -prometheus_scrape_configs_default: -- job_name: "prometheus" - 
metrics_path: "/metrics" - static_configs: - - targets: - - "{{ prometheus_address }}:9090" -- job_name: "grafana" - static_configs: - - targets: - - "{{ grafana_api_address }}:{{ grafana_port }}" -- job_name: "node" - file_sd_configs: - - files: - - "/etc/prometheus/file_sd/node.yml" - relabel_configs: - # strip off port - - source_labels: ['__address__'] - separator: ':' - regex: '(.*):.*' - target_label: 'instance' - replacement: '${1}' - scrape_interval: 30s - scrape_timeout: 20s +# prometheus_scrape_configs_default: +# - job_name: "prometheus" +# metrics_path: "/metrics" +# static_configs: +# - targets: +# - "{{ prometheus_address }}:9090" +# - job_name: "grafana" +# static_configs: +# - targets: +# - "{{ grafana_api_address }}:{{ grafana_port }}" +# - job_name: "node" +# file_sd_configs: +# - files: +# - "/etc/prometheus/file_sd/node.yml" +# relabel_configs: +# # strip off port +# - source_labels: ['__address__'] +# separator: ':' +# regex: '(.*):.*' +# target_label: 'instance' +# replacement: '${1}' +# scrape_interval: 30s +# scrape_timeout: 20s + +# - job_name: "slurm_exporter" +# scrape_interval: 30s +# scrape_timeout: 30s +# static_configs: +# - targets: +# - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" +prometheus_scrape_configs_default: - job_name: "slurm_exporter" scrape_interval: 30s scrape_timeout: 30s static_configs: - targets: - - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" + - "{{ control_ip }}:{{ slurm_exporter_port }}" prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" From a4dca772d93427850ca0cff8bc3cbe6a92ea9db4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 26 Sep 2024 09:54:29 +0100 Subject: [PATCH 09/90] Added ood exporter to k3s --- .../defaults/main/main.yml | 14 ++++----- .../kube_prometheus_stack/tasks/main.yml | 30 +++++++++++++++++++ .../inventory/group_vars/all/openondemand.yml | 2 +- 3 files 
changed, 38 insertions(+), 8 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index d61a9315a..4d072ee36 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -12,6 +12,7 @@ kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 10m +login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably needs to be more robust control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" @@ -91,13 +92,12 @@ prometheus_targets: {} # labels: # env: test -# TODO: readd -# prometheus_scrape_configs: -# - job_name: "prometheus" -# metrics_path: "{{ prometheus_metrics_path }}" -# static_configs: -# - targets: -# - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090" +prometheus_scrape_configs: + - job_name: "prometheus" + metrics_path: "{{ prometheus_metrics_path }}" + static_configs: + - targets: + - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090" # - job_name: "node" # file_sd_configs: # - files: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index e4e0f78ca..8bafb0b2d 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -45,6 +45,36 @@ annotations: storageclass.kubernetes.io/is-default-class: "false" +# not looping through these because templating doesn't set ports as integer +- name: Creating headless service for OOD exporter + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Service + metadata: + name: ood-exporter + spec: + clusterIP: None + ports: + - 
name: ood-exporter + port: 9301 + protocol: TCP + +- name: Binding OOD exporter service to host + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Endpoints + metadata: + name: ood-exporter + subsets: + - addresses: + - ip: "{{ login_ip }}" + ports: + - port: 9301 + name: ood-exporter + protocol: TCP + - name: Creating headless service for slurm exporter kubernetes.core.k8s: namespace: "{{ kube_prometheus_stack_release_namespace }}" diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 5e85392ca..f206f8192 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -178,7 +178,7 @@ openondemand_scrape_configs: scrape_interval: 2m static_configs: - targets: - - "{{ openondemand_address }}:9301" + - "{{ login_ip }}:9301" labels: environment: "{{ appliances_environment_name }}" service: "openondemand" From 6081d77c372f36f56b4cca20a923c6069d322fc4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 26 Sep 2024 11:05:19 +0100 Subject: [PATCH 10/90] added grafana metrics --- .../defaults/main/main.yml | 15 ++++---- .../inventory/group_vars/all/prometheus.yml | 35 +++---------------- 2 files changed, 12 insertions(+), 38 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 4d072ee36..70f101f33 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -93,15 +93,16 @@ prometheus_targets: {} # env: test prometheus_scrape_configs: - - job_name: "prometheus" - metrics_path: "{{ prometheus_metrics_path }}" + - job_name: "slurm_exporter" + scrape_interval: 30s + scrape_timeout: 30s static_configs: - targets: - - "{{ ansible_fqdn | default(ansible_host) | 
default('localhost') }}:9090" -# - job_name: "node" -# file_sd_configs: -# - files: -# - "{{ prometheus_config_dir }}/file_sd/node.yml" + - "{{ control_ip }}:{{ slurm_exporter_port }}" + - job_name: "grafana" + static_configs: + - targets: + - "kube-prometheus-stack-grafana:80" # Alternative config file name, searched in ansible templates path. # prometheus_config_file: 'prometheus.yml.j2' diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index ccff07c01..ac202e197 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -21,37 +21,6 @@ prometheus_alert_rules: [] prometheus_targets: node: "{{ groups.get('node_exporter', []) | reject('equalto', 'localhost') | prometheus_node_exporter_targets(env | default('ungrouped')) }}" -# prometheus_scrape_configs_default: -# - job_name: "prometheus" -# metrics_path: "/metrics" -# static_configs: -# - targets: -# - "{{ prometheus_address }}:9090" -# - job_name: "grafana" -# static_configs: -# - targets: -# - "{{ grafana_api_address }}:{{ grafana_port }}" -# - job_name: "node" -# file_sd_configs: -# - files: -# - "/etc/prometheus/file_sd/node.yml" -# relabel_configs: -# # strip off port -# - source_labels: ['__address__'] -# separator: ':' -# regex: '(.*):.*' -# target_label: 'instance' -# replacement: '${1}' -# scrape_interval: 30s -# scrape_timeout: 20s - -# - job_name: "slurm_exporter" -# scrape_interval: 30s -# scrape_timeout: 30s -# static_configs: -# - targets: -# - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" - prometheus_scrape_configs_default: - job_name: "slurm_exporter" scrape_interval: 30s @@ -59,5 +28,9 @@ prometheus_scrape_configs_default: static_configs: - targets: - "{{ control_ip }}:{{ slurm_exporter_port }}" +- job_name: "grafana" + static_configs: + - targets: + - "kube-prometheus-stack-grafana:80" prometheus_scrape_configs: "{{ 
prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" From 84fd3558ff62bfa02c2ea21b0f68fcc1da75be83 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 26 Sep 2024 11:27:29 +0100 Subject: [PATCH 11/90] fixed alertmanager status --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index ea02dca60..e78212512 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -133,6 +133,7 @@ kube_prometheus_stack_release_defaults: global: resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" alertmanagerSpec: + forceEnableClusterMode: true routePrefix: /alertmanager externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" # Make sure that alertmanager finds configurations with the alertmanager name as a label From e2d1c62b62f58199a604f83940c759410a038294 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 27 Sep 2024 11:09:19 +0100 Subject: [PATCH 12/90] Dashboards now installed into k3s (dataources not configured yet) --- .../roles/grafana-dashboards/tasks/main.yml | 149 +++++++++++------- .../templates/configmap-template.yml.j2 | 9 ++ .../kube_prometheus_stack/tasks/main.yml | 4 + .../inventory/group_vars/all/prometheus.yml | 2 + 4 files changed, 107 insertions(+), 57 deletions(-) create mode 100644 ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 diff --git a/ansible/roles/grafana-dashboards/tasks/main.yml b/ansible/roles/grafana-dashboards/tasks/main.yml index 235088f77..211b6c6f0 100644 --- a/ansible/roles/grafana-dashboards/tasks/main.yml +++ b/ansible/roles/grafana-dashboards/tasks/main.yml @@ -25,7 +25,8 @@ - become: false block: - name: Create local grafana dashboard 
directory - tempfile: + file: + path: /tmp/dashboards state: directory register: _tmp_dashboards changed_when: false @@ -107,59 +108,93 @@ when: - grafana_dashboards | length > 0 -- name: Create/Update dashboards file (provisioning) - become: true - copy: - dest: "/etc/grafana/provisioning/dashboards/ansible.yml" - content: | - apiVersion: 1 - providers: - - name: 'default' - orgId: 1 - folder: '' - type: file - options: - path: "{{ grafana_data_dir }}/dashboards" - backup: false - owner: root - group: grafana - mode: 0640 - notify: restart grafana - -- name: Register preexisting dashboards - become: true - find: - paths: "{{ grafana_data_dir }}/dashboards" - hidden: true - patterns: - - "*.json" - register: _dashboards_pre - -- name: Import grafana dashboards - become: true - copy: - remote_src: yes - src: "{{ _tmp_dashboards.path }}/" # Note trailing / to only copy contents, not directory itself - dest: "{{ grafana_data_dir }}/dashboards/" - notify: "provisioned dashboards changed" - -- name: Register all installed dashboards - become: true - find: - paths: "{{ grafana_data_dir }}/dashboards" - hidden: true - patterns: - - "*.json" - register: _dashboards_post - -- name: Get dashboard lists - set_fact: - _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" - _dashboards_post_list: "{{ _dashboards_post | json_query('files[*].path') | default([]) }}" - -- name: Remove installed dashboards not defined through this role - become: true - file: - path: "{{ item }}" - state: absent - with_items: "{{ _dashboards_pre_list | difference( _dashboards_post_list ) }}" +# Templating partial manifests and then adding the dashboard's data server-side because the k8s module doesn't like Jinja2 +- name: Create partial configmaps for server-side templating + ansible.builtin.template: + src: configmap-template.yml.j2 + dest: "{{ _tmp_dashboards.path }}/{{ item_filename }}.yml" + loop: "{{ grafana_dashboards }}" + vars: + item_filename: "{{ 
((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" + +- name: Setting data keys + ansible.builtin.replace: + path: "{{ _tmp_dashboards.path }}/{{ item_filename }}.yml" + regexp: 'PLACEHOLDER' + replace: "{{ item_filename }}" + loop: "{{ grafana_dashboards }}" + vars: + item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" + +- name: Appending json data to configmaps + ansible.builtin.shell: + cmd: "sed 's/^/ /' {{ item_path }} >> {{ item_path }}.yml" + loop: "{{ grafana_dashboards }}" + vars: + item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" + item_path: "{{ _tmp_dashboards.path }}/{{ item_filename }}" + +- name: Applying dashboard configmaps + ansible.builtin.k8s: + src: "{{ item_path }}.yml" + loop: "{{ grafana_dashboards }}" + vars: + item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" + item_path: "{{ _tmp_dashboards.path }}/{{ item_filename }}" + +# - name: Create/Update dashboards file (provisioning) +# become: true +# copy: +# dest: "/etc/grafana/provisioning/dashboards/ansible.yml" +# content: | +# apiVersion: 1 +# providers: +# - name: 'default' +# orgId: 1 +# folder: '' +# type: file +# options: +# path: "{{ grafana_data_dir }}/dashboards" +# backup: false +# owner: root +# group: grafana +# mode: 0640 +# notify: restart grafana + +# - name: Register preexisting dashboards +# become: true +# find: +# paths: "{{ grafana_data_dir }}/dashboards" +# hidden: true +# patterns: +# - "*.json" +# register: _dashboards_pre + +# - name: Import grafana dashboards +# become: true +# copy: +# remote_src: yes +# src: "{{ _tmp_dashboards.path }}/" # Note trailing / to only copy contents, not directory itself +# dest: "{{ grafana_data_dir }}/dashboards/" +# notify: "provisioned dashboards changed" + +# - name: Register all installed 
dashboards +# become: true +# find: +# paths: "{{ grafana_data_dir }}/dashboards" +# hidden: true +# patterns: +# - "*.json" +# register: _dashboards_post + +# - name: Get dashboard lists +# set_fact: +# _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" +# _dashboards_post_list: "{{ _dashboards_post | json_query('files[*].path') | default([]) }}" + +# - name: Remove installed dashboards not defined through this role +# become: true +# file: +# path: "{{ item }}" +# state: absent +# with_items: "{{ _dashboards_pre_list | difference( _dashboards_post_list ) }}" diff --git a/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 b/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 new file mode 100644 index 000000000..d6e2473a0 --- /dev/null +++ b/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + name: "{{ item_filename }}" + labels: + grafana_dashboard: "1" +data: + PLACEHOLDER: | diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 8bafb0b2d..d2d08fc24 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -104,6 +104,10 @@ name: slurm-exporter protocol: TCP +- name: Import grafana dashboards + ansible.builtin.import_role: + name: grafana-dashboards + - name: Install kube-prometheus-stack on target Kubernetes cluster kubernetes.core.helm: chart_ref: "{{ kube_prometheus_stack_chart_name }}" diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index ac202e197..d2239b5c3 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -33,4 +33,6 @@ 
prometheus_scrape_configs_default: - targets: - "kube-prometheus-stack-grafana:80" +kube_prometheus_stack_release_namespace: monitoring-system + prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" From 7afdc1dc3b5ba9481525cd9e709f18c3ac37fa96 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 1 Oct 2024 13:49:11 +0100 Subject: [PATCH 13/90] Added slurmstats datasource --- .../defaults/main/helm.yml | 2 + .../kube_prometheus_stack/tasks/main.yml | 38 ++++++++++++++++++- .../inventory/group_vars/all/grafana.yml | 10 ++--- 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index e78212512..9c37115e5 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -115,6 +115,8 @@ kube_prometheus_stack_release_defaults: serve_from_sub_path: true auth.anonymous: enabled: true + additionalDataSources: "{{ grafana_datasources }}" + plugins: "{{ grafana_plugins }}" nodeSelector: kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index d2d08fc24..aac539b87 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -1,5 +1,12 @@ --- +- name: Creating namespace + kubernetes.core.k8s: + name: "{{ kube_prometheus_stack_release_namespace }}" + api_version: v1 + kind: Namespace + state: present + # Because of the way Helm handles CRDs, we upgrade them first - name: Get kube-prometheus-stack CRDs command: >- @@ -104,6 +111,35 @@ name: slurm-exporter protocol: TCP +- name: Creating headless service for opensearch datasource + 
kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Service + metadata: + name: opensearch + spec: + clusterIP: None + ports: + - name: opensearch + port: 9200 + protocol: TCP + +- name: Binding opensearch service to host + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Endpoints + metadata: + name: opensearch + subsets: + - addresses: + - ip: "{{ control_ip }}" + ports: + - port: 9200 + name: opensearch + protocol: TCP + - name: Import grafana dashboards ansible.builtin.import_role: name: grafana-dashboards @@ -117,6 +153,6 @@ release_name: "{{ kube_prometheus_stack_release_name }}" release_values: "{{ kube_prometheus_stack_release_values }}" atomic: yes - create_namespace: yes + create_namespace: no wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 90ef51c59..802b025e0 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -58,14 +58,14 @@ grafana_security: allow_embedding: true grafana_datasources: - - name: prometheus - type: prometheus - url: "http://{{ prometheus_address }}:9090" # default prometheus port - editable: true + # - name: prometheus + # type: prometheus + # url: "http://{{ prometheus_address }}:9090" # default prometheus port + # editable: true - name: slurmstats # see https://github.com/grafana/opensearch-datasource#configure-the-data-source-with-provisioning type: grafana-opensearch-datasource - url: "https://{{ opensearch_address }}:9200" + url: "https://{{ control_ip }}:9200" basicAuth: true basicAuthUser: admin secureJsonData: From b3020caeabfadd0e573af04b7ed30edb5e9dcfc8 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 1 Oct 2024 16:16:18 +0100 Subject: [PATCH 14/90] enabled ips for monitoring 
services (except prometheus) --- .../kube_prometheus_stack/defaults/main/helm.yml | 13 +++++++++---- .../kube_prometheus_stack/defaults/main/main.yml | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 9c37115e5..6ef94e969 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -56,6 +56,7 @@ kube_prometheus_stack_release_defaults: enabled: true hosts: - "{{ control_sslip }}" + - "localhost" ingressClassName: traefik paths: - /prometheus @@ -99,10 +100,13 @@ kube_prometheus_stack_release_defaults: additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" grafana: + # service: + # port: 3000 # seems to cause issues with prometheus targets ingress: enabled: true - hosts: - - "{{ control_sslip }}" + # hosts: + # - "{{ control_sslip }}" + # - "localhost" ingressClassName: traefik path: /grafana sidecar: @@ -123,8 +127,9 @@ kube_prometheus_stack_release_defaults: alertmanager: ingress: enabled: true - hosts: - - "{{ control_sslip }}" + # hosts: + # - "{{ control_sslip }}" + # - "localhost" ingressClassName: traefik paths: - /alertmanager diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 70f101f33..e0e5b378e 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -82,8 +82,8 @@ prometheus_remote_read: [] # password: FOO prometheus_external_labels: - environment: "{{ control_sslip }}" -# environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" + environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" + # environment: "{{ control_sslip }}" prometheus_targets: {} # node: From 
0dff07f0c4ab38dca11dec38b1186ab2c4a540c6 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 2 Oct 2024 10:58:24 +0100 Subject: [PATCH 15/90] Added grafana to state directory and made port configurable --- .../defaults/main/helm.yml | 10 +++-- .../defaults/main/main.yml | 4 +- .../kube_prometheus_stack/tasks/main.yml | 43 +++++++++++++++++-- 3 files changed, 50 insertions(+), 7 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 6ef94e969..d76a7802f 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -72,7 +72,7 @@ kube_prometheus_stack_release_defaults: storageSpec: volumeClaimTemplate: spec: - volumeName: appliances-state-dir + volumeName: prometheus-dir accessModes: - ReadWriteOnce resources: @@ -100,8 +100,8 @@ kube_prometheus_stack_release_defaults: additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" grafana: - # service: - # port: 3000 # seems to cause issues with prometheus targets + service: + port: "{{ grafana_port }}" ingress: enabled: true # hosts: @@ -123,6 +123,10 @@ kube_prometheus_stack_release_defaults: plugins: "{{ grafana_plugins }}" nodeSelector: kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + persistence: + type: pvc + enabled: true + existingClaim: grafana-pvc alertmanager: ingress: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index e0e5b378e..ce54733e8 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -16,6 +16,8 @@ login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" control_sslip: "{{ 
control_ip | regex_replace('\\.', '-') }}.sslip.io" +grafana_claim_size: 10Gi + ### PREVIOUS ROLE VALUES prometheus_image_tag: "v2.27.0" @@ -102,7 +104,7 @@ prometheus_scrape_configs: - job_name: "grafana" static_configs: - targets: - - "kube-prometheus-stack-grafana:80" + - "kube-prometheus-stack-grafana:{{ grafana_port }}" # Alternative config file name, searched in ansible templates path. # prometheus_config_file: 'prometheus.yml.j2' diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index aac539b87..363f07ddd 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -22,16 +22,17 @@ args: stdin: "{{ kube_prometheus_stack_crds.stdout }}" -- name: Create hostPath volume in /var/lib/state +# variables would need refactoring to let us loop through the data directories nicely +- name: Create Prometheus hostPath volume in /var/lib/state kubernetes.core.k8s: namespace: "{{ kube_prometheus_stack_release_namespace }}" definition: apiVersion: v1 kind: PersistentVolume metadata: - name: appliances-state-dir + name: prometheus-dir labels: - app.kubernetes.io/name: appliances-state-dir + app.kubernetes.io/name: prometheus-dir spec: capacity: storage: "{{ kube_prometheus_stack_volume_size }}" @@ -41,6 +42,42 @@ path: "{{ prometheus_db_dir }}" type: DirectoryOrCreate +- name: Create Grafana hostPath volume in /var/lib/state + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: grafana-dir + labels: + app.kubernetes.io/name: grafana-dir + spec: + capacity: + storage: "{{ grafana_claim_size }}" + accessModes: + - ReadWriteOnce + hostPath: + path: "{{ grafana_data_dir }}" + type: DirectoryOrCreate + +- name: Create PVC for Grafana + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + apiVersion: v1 + 
kind: PersistentVolumeClaim + metadata: + name: grafana-pvc + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: "{{ grafana_claim_size }}" + volumeMode: Filesystem + volumeName: grafana-dir + - name: Disable rancher default storage class kubernetes.core.k8s: namespace: "{{ kube_prometheus_stack_release_namespace }}" From f7e555b7cf596c1f9eeab62663146b26953e6497 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 3 Oct 2024 11:05:09 +0100 Subject: [PATCH 16/90] grafana can now be reverse proxied by ood --- .../grafana-dashboards/defaults/main.yml | 2 +- .../defaults/main/helm.yml | 7 +++--- .../defaults/main/main.yml | 2 +- .../kube_prometheus_stack/tasks/main.yml | 22 +++++++++---------- .../inventory/group_vars/all/grafana.yml | 4 ++-- 5 files changed, 18 insertions(+), 19 deletions(-) diff --git a/ansible/roles/grafana-dashboards/defaults/main.yml b/ansible/roles/grafana-dashboards/defaults/main.yml index e93ce8e31..064b86498 100644 --- a/ansible/roles/grafana-dashboards/defaults/main.yml +++ b/ansible/roles/grafana-dashboards/defaults/main.yml @@ -1,7 +1,7 @@ --- grafana_address: "0.0.0.0" -grafana_port: 3000 +grafana_port: 80 # External Grafana address. 
Variable maps to "root_url" in grafana server section grafana_url: "http://{{ grafana_address }}:{{ grafana_port }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index d76a7802f..c1a36a663 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -108,15 +108,14 @@ kube_prometheus_stack_release_defaults: # - "{{ control_sslip }}" # - "localhost" ingressClassName: traefik - path: /grafana + path: / sidecar: dashboards: searchNamespace: ALL grafana.ini: server: - domain: "{{ control_sslip }}" - root_url: "http://{{ control_sslip }}/grafana" - serve_from_sub_path: true + domain: "{{ openondemand_servername }}" + root_url: "{{ grafana_url_openondemand_proxy }}" auth.anonymous: enabled: true additionalDataSources: "{{ grafana_datasources }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index ce54733e8..a9464d541 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -10,7 +10,7 @@ kube_prometheus_stack_release_namespace: monitoring-system kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready -kube_prometheus_stack_wait_timeout: 10m +kube_prometheus_stack_wait_timeout: 5m login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably needs to be more robust control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 363f07ddd..e0efa7975 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -22,6 +22,17 @@ args: stdin: "{{ 
kube_prometheus_stack_crds.stdout }}" +- name: Disable rancher default storage class + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + state: patched + definition: + kind: StorageClass + metadata: + name: local-path + annotations: + storageclass.kubernetes.io/is-default-class: "false" + # variables would need refactoring to let us loop through the data directories nicely - name: Create Prometheus hostPath volume in /var/lib/state kubernetes.core.k8s: @@ -78,17 +89,6 @@ volumeMode: Filesystem volumeName: grafana-dir -- name: Disable rancher default storage class - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - state: patched - definition: - kind: StorageClass - metadata: - name: local-path - annotations: - storageclass.kubernetes.io/is-default-class: "false" - # not looping through these because templating doesn't set ports as integer - name: Creating headless service for OOD exporter kubernetes.core.k8s: diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 802b025e0..057252018 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -5,7 +5,7 @@ grafana_version: '9.5.21' # need to copy some role defaults here so we can use in inventory: -grafana_port: 3000 +grafana_port: 80 # Define where state is stored grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana" @@ -16,7 +16,7 @@ grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_url_direct: "http://{{ grafana_address }}:{{ grafana_port }}" -grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['grafana'].0 }}/{{ grafana_port }}" +grafana_url_openondemand_proxy: "http://{{ 
openondemand_servername | default('') }}/rnode/{{ groups['grafana'].0 }}/{{ grafana_port }}" grafana_url: "{{ grafana_url_openondemand_proxy if groups['openondemand'] | count > 0 else grafana_url_direct }}" grafana_serve_from_sub_path: "{{ groups['openondemand'] | count > 0 }}" From d142a9f4fda1be8ee525fd510b475a8be29d27bd Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 7 Oct 2024 09:24:13 +0100 Subject: [PATCH 17/90] Ported grafana rolevars --- .../roles/grafana-dashboards/tasks/main.yml | 57 ------ .../defaults/main/helm.yml | 14 +- .../defaults/main/main.yml | 188 ++++++++++++++++++ 3 files changed, 201 insertions(+), 58 deletions(-) diff --git a/ansible/roles/grafana-dashboards/tasks/main.yml b/ansible/roles/grafana-dashboards/tasks/main.yml index 211b6c6f0..d8042d0bb 100644 --- a/ansible/roles/grafana-dashboards/tasks/main.yml +++ b/ansible/roles/grafana-dashboards/tasks/main.yml @@ -141,60 +141,3 @@ vars: item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" item_path: "{{ _tmp_dashboards.path }}/{{ item_filename }}" - -# - name: Create/Update dashboards file (provisioning) -# become: true -# copy: -# dest: "/etc/grafana/provisioning/dashboards/ansible.yml" -# content: | -# apiVersion: 1 -# providers: -# - name: 'default' -# orgId: 1 -# folder: '' -# type: file -# options: -# path: "{{ grafana_data_dir }}/dashboards" -# backup: false -# owner: root -# group: grafana -# mode: 0640 -# notify: restart grafana - -# - name: Register preexisting dashboards -# become: true -# find: -# paths: "{{ grafana_data_dir }}/dashboards" -# hidden: true -# patterns: -# - "*.json" -# register: _dashboards_pre - -# - name: Import grafana dashboards -# become: true -# copy: -# remote_src: yes -# src: "{{ _tmp_dashboards.path }}/" # Note trailing / to only copy contents, not directory itself -# dest: "{{ grafana_data_dir }}/dashboards/" -# notify: "provisioned dashboards changed" - -# - name: Register all 
installed dashboards -# become: true -# find: -# paths: "{{ grafana_data_dir }}/dashboards" -# hidden: true -# patterns: -# - "*.json" -# register: _dashboards_post - -# - name: Get dashboard lists -# set_fact: -# _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" -# _dashboards_post_list: "{{ _dashboards_post | json_query('files[*].path') | default([]) }}" - -# - name: Remove installed dashboards not defined through this role -# become: true -# file: -# path: "{{ item }}" -# state: absent -# with_items: "{{ _dashboards_pre_list | difference( _dashboards_post_list ) }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index c1a36a663..01d8c161a 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -116,8 +116,14 @@ kube_prometheus_stack_release_defaults: server: domain: "{{ openondemand_servername }}" root_url: "{{ grafana_url_openondemand_proxy }}" + auth: "{{ grafana_auth }}" auth.anonymous: - enabled: true + enabled: "{{ grafana_anonymous_auth }}" + analytics: "{{ grafana_analytics }}" + smtp: "{{ grafana_smtp }}" + log: "{{ grafana_log }}" + tracing: "{{ grafana_tracing }}" + panels: "{{ grafana_panels }}" additionalDataSources: "{{ grafana_datasources }}" plugins: "{{ grafana_plugins }}" nodeSelector: @@ -126,6 +132,12 @@ kube_prometheus_stack_release_defaults: type: pvc enabled: true existingClaim: grafana-pvc + ldap: + enabled: "{{ false if grafana_ldap == {} else true }}" + config: "{{ grafana_ldap | to_yaml }}" + image: + tag: "{{ grafana_image_tag }}" + env: "{{ grafana_environment }}" alertmanager: ingress: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index a9464d541..0dab3d6a8 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ 
b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -18,6 +18,8 @@ control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" grafana_claim_size: 10Gi +grafana_anonymous_auth: true + ### PREVIOUS ROLE VALUES prometheus_image_tag: "v2.27.0" @@ -246,3 +248,189 @@ prometheus_alert_rules: # ------------------------------------------------------------------------------------------ +### PREVIOUS GRAFANA VARS +grafana_image_tag: 11.2.2 + +grafana_instance: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" + +grafana_data_dir: "/var/lib/grafana" + +grafana_port: 80 + +# Additional options for grafana "server" section +# This section WILL omit options for: http_addr, http_port, domain, and root_url, as those settings are set by variables listed before +# grafana_server: +# protocol: http +# enforce_domain: false +# socket: "" +# cert_key: "" +# cert_file: "" +# enable_gzip: false +# static_root_path: public +# router_logging: false +# serve_from_sub_path: false + +# # Variables correspond to ones in grafana.ini configuration file +# # Security +# grafana_security: +# admin_user: admin +# admin_password: "" +# # secret_key: "" +# # login_remember_days: 7 +# # cookie_username: grafana_user +# # cookie_remember_name: grafana_remember +# # disable_gravatar: true +# # data_source_proxy_whitelist: + +# User management and registration +# grafana_welcome_email_on_sign_up: false +# grafana_users: +# allow_sign_up: false +# # allow_org_create: true +# # auto_assign_org: true +# auto_assign_org_role: Viewer +# # login_hint: "email or username" +# default_theme: dark +# # external_manage_link_url: "" +# # external_manage_link_name: "" +# # external_manage_info: "" + +# grafana authentication mechanisms +grafana_auth: {} +# disable_login_form: false +# oauth_auto_login: false +# disable_signout_menu: false +# signout_redirect_url: "" +# ldap: +# config_file: "/etc/grafana/ldap.toml" +# allow_sign_up: false +# basic: +# enabled: true + 
+grafana_ldap: {} +# verbose_logging: false +# servers: +# host: 127.0.0.1 +# port: 389 # 636 for SSL +# use_ssl: false +# start_tls: false +# ssl_skip_verify: false +# root_ca_cert: /path/to/certificate.crt +# bind_dn: "cn=admin,dc=grafana,dc=org" +# bind_password: grafana +# search_filter: "(cn=%s)" # "(sAMAccountName=%s)" on AD +# search_base_dns: +# - "dc=grafana,dc=org" +# group_search_filter: "(&(objectClass=posixGroup)(memberUid=%s))" +# group_search_base_dns: +# - "ou=groups,dc=grafana,dc=org" +# attributes: +# name: givenName +# surname: sn +# username: sAMAccountName +# member_of: memberOf +# email: mail +# group_mappings: +# - name: Main Org. +# id: 1 +# groups: +# - group_dn: "cn=admins,ou=groups,dc=grafana,dc=org" +# org_role: Admin +# - group_dn: "cn=editors,ou=groups,dc=grafana,dc=org" +# org_role: Editor +# - group_dn: "*" +# org_role: Viewer +# - name: Alternative Org +# id: 2 +# groups: +# - group_dn: "cn=alternative_admins,ou=groups,dc=grafana,dc=org" +# org_role: Admin + +grafana_analytics: {} +# reporting_enabled: true +# google_analytics_ua_id: "" + +# Set this for mail notifications +grafana_smtp: {} +# host: +# user: +# password: +# from_address: + +# Grafana logging configuration +grafana_log: +# mode: 'console file' +# level: info + +# Distributed tracing options +grafana_tracing: {} +# address: "localhost:6831" +# always_included_tag: "tag1:value1,tag2:value2" +# sampler_type: const +# sampler_param: 1 + +grafana_snapshots: {} +# external_enabled: true +# external_snapshot_url: "https://snapshots-origin.raintank.io" +# external_snapshot_name: "Publish to snapshot.raintank.io" +# snapshot_remove_expired: true +# snapshot_TTL_days: 90 + +# # External image store +# grafana_image_storage: {} +# # provider: gcs +# # key_file: +# # bucket: +# # path: + + +####### +# Plugins from https://grafana.com/plugins +grafana_plugins: [] +# - raintank-worldping-app + +# # Dashboards from https://grafana.com/dashboards +# grafana_dashboards: [] +# # - 
dashboard_id: '4271' +# # revision_id: '3' +# # datasource: 'Prometheus' +# # - dashboard_id: '1860' +# # revision_id: '4' +# # datasource: 'Prometheus' +# # - dashboard_id: '358' +# # revision_id: '1' +# # datasource: 'Prometheus' + +# grafana_dashboards_dir: "dashboards" + +# # Alert notification channels to configure +# grafana_alert_notifications: [] +# # - name: "Email Alert" +# # type: "email" +# # uid: channel1 +# # is_default: true +# # settings: +# # addresses: "example@example.com" + +# Datasources to configure +grafana_datasources: [] +# - name: "Prometheus" +# type: "prometheus" +# access: "proxy" +# url: "http://prometheus.mydomain" +# basicAuth: true +# basicAuthUser: "admin" +# basicAuthPassword: "password" +# isDefault: true +# jsonData: +# tlsAuth: false +# tlsAuthWithCACert: false +# tlsSkipVerify: true + +grafana_environment: {} + +# Panels configurations +grafana_panels: {} +# disable_sanitize_html: false +# enable_alpha: false + From 7fa36098df1c331d70c772bd5dd4c597725f1a46 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 7 Oct 2024 14:32:48 +0100 Subject: [PATCH 18/90] Added slack integration default --- .../defaults/main/helm.yml | 6 +---- .../defaults/main/main.yml | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 01d8c161a..7060b2a16 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -149,11 +149,7 @@ kube_prometheus_stack_release_defaults: paths: - /alertmanager # Don't apply the namespace grouping by default - config: - route: - group_by: ['...'] - global: - resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" + config: "{{ alertmanager_config }}" alertmanagerSpec: forceEnableClusterMode: true routePrefix: /alertmanager diff --git 
a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 0dab3d6a8..eefbb47b6 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -20,6 +20,28 @@ grafana_claim_size: 10Gi grafana_anonymous_auth: true +slack_integration: + channel: "#alerts" + app_creds: # TODO: need to find best way to build this into secrets store as must be manually retrieved from app + +alertmanager_config: + route: + group_by: ['...'] + receiver: slack-receiver + global: + resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" + receivers: + - name: 'null' + - name: slack-receiver + slack_configs: + - channel: "{{ slack_integration.channel }}" + api_url: https://slack.com/api/chat.postMessage + http_config: + authorization: + credentials: "{{ slack_integration.app_creds }}" + text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" + title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver" + ### PREVIOUS ROLE VALUES prometheus_image_tag: "v2.27.0" From 96edb79976403be2e7346f3d3a3bff6c616645ed Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 7 Oct 2024 17:17:37 +0100 Subject: [PATCH 19/90] Ported alertmanager rolevars --- .../defaults/main/helm.yml | 3 +++ .../defaults/main/main.yml | 19 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 7060b2a16..d4e4ec9d3 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -151,6 +151,8 @@ kube_prometheus_stack_release_defaults: # Don't apply the namespace grouping by default config: "{{ alertmanager_config }}" alertmanagerSpec: + image: + 
tag: "{{ alertmanager_image_tag }}" forceEnableClusterMode: true routePrefix: /alertmanager externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" @@ -167,6 +169,7 @@ kube_prometheus_stack_release_defaults: serviceMonitor: scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" proxyUrl: "{{ prometheus_alertmanager_config.proxy_url | default( '' ) }}" + templateFiles: "{{ alertmanager_template_files }}" prometheusOperator: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index eefbb47b6..55d4d01af 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -41,6 +41,9 @@ alertmanager_config: credentials: "{{ slack_integration.app_creds }}" text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver" + send_resolved: true + +alertmanager_replicas: 1 ### PREVIOUS ROLE VALUES @@ -111,13 +114,6 @@ prometheus_external_labels: environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" # environment: "{{ control_sslip }}" -prometheus_targets: {} -# node: -# - targets: -# - localhost:9100 -# labels: -# env: test - prometheus_scrape_configs: - job_name: "slurm_exporter" scrape_interval: 30s @@ -456,3 +452,12 @@ grafana_panels: {} # disable_sanitize_html: false # enable_alpha: false +### PREVIOUS ALERTMANAGER ROLEVARS + +alertmanager_image_tag: v0.27.0 + +# alertmanager_config_file: 'alertmanager.yml.j2' + +alertmanager_template_files: {} + +# alertmanager_http_config: {} From b13311a93ff2f884dee30e911cf3c0e0087ff943 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 8 Oct 2024 10:47:08 +0100 Subject: [PATCH 20/90] removed k3s ingress --- ansible/roles/k3s/files/start_k3s.yml | 17 ----------------- 
ansible/roles/k3s/tasks/main.yml | 2 +- 2 files changed, 1 insertion(+), 18 deletions(-) diff --git a/ansible/roles/k3s/files/start_k3s.yml b/ansible/roles/k3s/files/start_k3s.yml index 56ef20313..78f950a59 100644 --- a/ansible/roles/k3s/files/start_k3s.yml +++ b/ansible/roles/k3s/files/start_k3s.yml @@ -18,23 +18,6 @@ line: "K3S_URL=https://{{ k3s_server_name }}:6443" when: k3s_server_name is defined - - name: Only run loadbalancer on k3s server - # avoids problems with Ondemand https server - when: k3s_server_name is undefined - block: - - name: Create override directory - ansible.builtin.file: - state: directory - path: "/etc/systemd/system/{{ service_name }}.service.d" - - - name: Set loadbalancer label on k3s server - ansible.builtin.copy: - dest: "/etc/systemd/system/{{ service_name }}.service.d/override.conf" - content: | - [Service] - ExecStart= - ExecStart=/usr/bin/k3s server --node-label svccontroller.k3s.cattle.io/enablelb=true - - name: Start k3s service ansible.builtin.systemd: name: "{{ service_name }}" diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 01f5d56d5..9e9e2ddf2 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -28,7 +28,7 @@ INSTALL_K3S_BIN_DIR: "/usr/bin" changed_when: true loop: - - server + - server --disable=traefik - agent - name: Install helm From 01718ee2270d5b052913130fc091e22d6e802327 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 8 Oct 2024 12:52:54 +0100 Subject: [PATCH 21/90] Services now exposed/proxied via nodeports --- .../defaults/main/helm.yml | 26 +---------- .../kube_prometheus_stack/tasks/main.yml | 43 +++++++++++++++++++ .../inventory/group_vars/all/grafana.yml | 2 +- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index d4e4ec9d3..180fa0ed6 100644 --- 
a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -52,16 +52,7 @@ kube_prometheus_stack_release_defaults: KubeProxyDown: true KubeControllerManagerDown: true prometheus: - ingress: - enabled: true - hosts: - - "{{ control_sslip }}" - - "localhost" - ingressClassName: traefik - paths: - - /prometheus prometheusSpec: - routePrefix: /prometheus externalUrl: "{{ kube_prometheus_stack_prometheus_external_url }}" podMonitorSelectorNilUsesHelmValues: false serviceMonitorSelectorNilUsesHelmValues: false @@ -103,12 +94,7 @@ kube_prometheus_stack_release_defaults: service: port: "{{ grafana_port }}" ingress: - enabled: true - # hosts: - # - "{{ control_sslip }}" - # - "localhost" - ingressClassName: traefik - path: / + path: /node/{{ groups['grafana'].0 }}/30001 sidecar: dashboards: searchNamespace: ALL @@ -116,6 +102,7 @@ kube_prometheus_stack_release_defaults: server: domain: "{{ openondemand_servername }}" root_url: "{{ grafana_url_openondemand_proxy }}" + serve_from_sub_path: true auth: "{{ grafana_auth }}" auth.anonymous: enabled: "{{ grafana_anonymous_auth }}" @@ -140,21 +127,12 @@ kube_prometheus_stack_release_defaults: env: "{{ grafana_environment }}" alertmanager: - ingress: - enabled: true - # hosts: - # - "{{ control_sslip }}" - # - "localhost" - ingressClassName: traefik - paths: - - /alertmanager # Don't apply the namespace grouping by default config: "{{ alertmanager_config }}" alertmanagerSpec: image: tag: "{{ alertmanager_image_tag }}" forceEnableClusterMode: true - routePrefix: /alertmanager externalUrl: "{{ kube_prometheus_stack_alertmanager_external_url }}" # Make sure that alertmanager finds configurations with the alertmanager name as a label alertmanagerConfigSelector: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index e0efa7975..145054350 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml 
+++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -193,3 +193,46 @@ create_namespace: no wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" + +# Again can't loop through these because the k8s module doesn't like templating ints +- name: Opening NodePort for Prometheus + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + name: prometheus-external + definition: + kind: Service + spec: + type: NodePort + selector: + app.kubernetes.io/name: prometheus + ports: + - port: 9090 + nodePort: 30000 + +- name: Opening NodePort for Grafana + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + name: grafana-external + definition: + kind: Service + spec: + type: NodePort + selector: + app.kubernetes.io/name: grafana + ports: + - port: 3000 + nodePort: 30001 + +- name: Opening NodePort for Alertmanager + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + name: alertmanager-external + definition: + kind: Service + spec: + type: NodePort + selector: + app.kubernetes.io/name: alertmanager + ports: + - port: 9093 + nodePort: 30002 diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 057252018..a15e5ce97 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -16,7 +16,7 @@ grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_url_direct: "http://{{ grafana_address }}:{{ grafana_port }}" -grafana_url_openondemand_proxy: "http://{{ openondemand_servername | default('') }}/rnode/{{ groups['grafana'].0 }}/{{ grafana_port }}" +grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['grafana'].0 }}/30001" grafana_url: "{{ 
grafana_url_openondemand_proxy if groups['openondemand'] | count > 0 else grafana_url_direct }}" grafana_serve_from_sub_path: "{{ groups['openondemand'] | count > 0 }}" From 74bd3ba753a5efae53e88374f2f5e884d83b026a Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 9 Oct 2024 14:06:32 +0100 Subject: [PATCH 22/90] Removed grafana servicemonitor and moved nodeports to helm config --- .../defaults/main/helm.yml | 13 +++++- .../defaults/main/main.yml | 13 +++--- .../kube_prometheus_stack/tasks/main.yml | 43 ------------------- .../inventory/group_vars/all/grafana.yml | 4 +- .../inventory/group_vars/all/prometheus.yml | 4 -- 5 files changed, 19 insertions(+), 58 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 180fa0ed6..3505a2e0d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -52,6 +52,9 @@ kube_prometheus_stack_release_defaults: KubeProxyDown: true KubeControllerManagerDown: true prometheus: + service: + type: NodePort + nodePort: "{{ prometheus_port }}" prometheusSpec: externalUrl: "{{ kube_prometheus_stack_prometheus_external_url }}" podMonitorSelectorNilUsesHelmValues: false @@ -92,9 +95,12 @@ kube_prometheus_stack_release_defaults: grafana: service: - port: "{{ grafana_port }}" + type: NodePort + nodePort: "{{ grafana_port }}" + serviceMonitor: + enabled: false ingress: - path: /node/{{ groups['grafana'].0 }}/30001 + path: "/node/{{ groups['grafana'].0 }}/{{ grafana_port }}" sidecar: dashboards: searchNamespace: ALL @@ -127,6 +133,9 @@ kube_prometheus_stack_release_defaults: env: "{{ grafana_environment }}" alertmanager: + service: + type: NodePort + nodePort: "{{ alertmanager_port }}" # Don't apply the namespace grouping by default config: "{{ alertmanager_config }}" alertmanagerSpec: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml 
b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 55d4d01af..4d66813ae 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -45,6 +45,11 @@ alertmanager_config: alertmanager_replicas: 1 +# Must be within K3s' reserved port range (default 30000-32767) +prometheus_port: 30000 +grafana_port: 30001 +alertmanager_port: 30002 + ### PREVIOUS ROLE VALUES prometheus_image_tag: "v2.27.0" @@ -94,7 +99,7 @@ prometheus_alert_relabel_configs: [] # regex: replica prometheus_global: - scrape_interval: 15s + scrape_interval: 30s scrape_timeout: 10s evaluation_interval: 15s @@ -121,10 +126,6 @@ prometheus_scrape_configs: static_configs: - targets: - "{{ control_ip }}:{{ slurm_exporter_port }}" - - job_name: "grafana" - static_configs: - - targets: - - "kube-prometheus-stack-grafana:{{ grafana_port }}" # Alternative config file name, searched in ansible templates path. # prometheus_config_file: 'prometheus.yml.j2' @@ -273,8 +274,6 @@ grafana_instance: "{{ ansible_fqdn | default(ansible_host) | default(inventory_h grafana_data_dir: "/var/lib/grafana" -grafana_port: 80 - # Additional options for grafana "server" section # This section WILL omit options for: http_addr, http_port, domain, and root_url, as those settings are set by variables listed before # grafana_server: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 145054350..e0efa7975 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -193,46 +193,3 @@ create_namespace: no wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" - -# Again can't loop through these because the k8s module doesn't like templating ints -- name: Opening NodePort for Prometheus - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - name: prometheus-external - 
definition: - kind: Service - spec: - type: NodePort - selector: - app.kubernetes.io/name: prometheus - ports: - - port: 9090 - nodePort: 30000 - -- name: Opening NodePort for Grafana - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - name: grafana-external - definition: - kind: Service - spec: - type: NodePort - selector: - app.kubernetes.io/name: grafana - ports: - - port: 3000 - nodePort: 30001 - -- name: Opening NodePort for Alertmanager - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - name: alertmanager-external - definition: - kind: Service - spec: - type: NodePort - selector: - app.kubernetes.io/name: alertmanager - ports: - - port: 9093 - nodePort: 30002 diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index a15e5ce97..cc6a325b4 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -5,7 +5,7 @@ grafana_version: '9.5.21' # need to copy some role defaults here so we can use in inventory: -grafana_port: 80 +grafana_port: 30001 # Define where state is stored grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana" @@ -16,7 +16,7 @@ grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_url_direct: "http://{{ grafana_address }}:{{ grafana_port }}" -grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['grafana'].0 }}/30001" +grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['grafana'].0 }}/{{ grafana_port }}" grafana_url: "{{ grafana_url_openondemand_proxy if groups['openondemand'] | count > 0 else grafana_url_direct }}" grafana_serve_from_sub_path: "{{ groups['openondemand'] | count > 0 
}}" diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index d2239b5c3..9c800e090 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -28,10 +28,6 @@ prometheus_scrape_configs_default: static_configs: - targets: - "{{ control_ip }}:{{ slurm_exporter_port }}" -- job_name: "grafana" - static_configs: - - targets: - - "kube-prometheus-stack-grafana:80" kube_prometheus_stack_release_namespace: monitoring-system From e724b5d3f586d7e26c172a84d355827d041f8eb6 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 9 Oct 2024 14:42:00 +0100 Subject: [PATCH 23/90] grafana admin now definable --- .../defaults/main/helm.yml | 2 ++ .../defaults/main/main.yml | 18 +++++++++--------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 3505a2e0d..a9d9cfdd9 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -131,6 +131,8 @@ kube_prometheus_stack_release_defaults: image: tag: "{{ grafana_image_tag }}" env: "{{ grafana_environment }}" + adminUser: "{{ grafana_security.admin_user }}" + adminPassword: "{{ grafana_security.admin_password }}" alertmanager: service: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 4d66813ae..604d73dae 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -289,15 +289,15 @@ grafana_data_dir: "/var/lib/grafana" # # Variables correspond to ones in grafana.ini configuration file # # Security -# grafana_security: -# admin_user: admin -# admin_password: "" -# # secret_key: "" -# # 
login_remember_days: 7 -# # cookie_username: grafana_user -# # cookie_remember_name: grafana_remember -# # disable_gravatar: true -# # data_source_proxy_whitelist: +grafana_security: + admin_user: grafana + admin_password: "{{ vault_grafana_admin_password }}" +# secret_key: "" # would we want to template the rest of this into the ini? its not currently +# login_remember_days: 7 +# cookie_username: grafana_user +# cookie_remember_name: grafana_remember +# disable_gravatar: true +# data_source_proxy_whitelist: # User management and registration # grafana_welcome_email_on_sign_up: false From 9c359d9bb6b414a8966652dc6e646c4dc99b0e20 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 9 Oct 2024 15:41:28 +0100 Subject: [PATCH 24/90] Now adds additional rules correctly --- .../defaults/main/main.yml | 257 +++++++++--------- 1 file changed, 131 insertions(+), 126 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 604d73dae..9c3575e17 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -138,132 +138,137 @@ prometheus_scrape_configs: # - prometheus/targets/*.json prometheus_alert_rules: - - alert: Watchdog - expr: vector(1) - for: 10m - labels: - severity: warning - annotations: - description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty." 
- summary: 'Ensure entire alerting pipeline is functional' - - alert: InstanceDown - expr: 'up == 0' - for: 5m - labels: - severity: critical - annotations: - description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}' - summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}' - - alert: RebootRequired - expr: 'node_reboot_required > 0' - labels: - severity: warning - annotations: - description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}' - summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}' - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}' - summary: 'Filesystem is predicted to run out of space within the next 24 hours.' - expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}' - summary: 'Filesystem is predicted to run out of space within the next 4 hours.' 
- expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' - summary: 'Filesystem has less than 5% space left.' - expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' - summary: 'Filesystem has less than 3% space left.' - expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}' - summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.' 
- expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}' - summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.' - expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' - summary: 'Filesystem has less than 5% inodes left.' - expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' - summary: 'Filesystem has less than 3% inodes left.' 
- expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}' - summary: 'Network interface is reporting many receive errors.' - expr: "increase(node_network_receive_errs_total[2m]) > 10\n" - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}' - summary: 'Network interface is reporting many transmit errors.' - expr: "increase(node_network_transmit_errs_total[2m]) > 10\n" - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}' - summary: 'Number of conntrack are getting close to the limit' - expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n" - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}' - summary: 'Clock skew detected.' - expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. 
Ensure NTP is configured on this host.{% endraw %}' - summary: 'Clock not synchronising.' - expr: "min_over_time(node_timex_sync_status[5m]) == 0\n" - for: 10m - labels: - severity: warning + appliance-rules: + groups: + - name: all + rules: + - alert: Watchdog + expr: vector(1) + for: 10m + labels: + severity: warning + alertname: Watchdog + annotations: + description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty." + summary: 'Ensure entire alerting pipeline is functional' + - alert: InstanceDown + expr: 'up == 0' + for: 5m + labels: + severity: critical + annotations: + description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}' + summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}' + - alert: RebootRequired + expr: 'node_reboot_required > 0' + labels: + severity: warning + annotations: + description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}' + summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}' + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}' + summary: 'Filesystem is predicted to run out of space within the next 24 hours.' 
+ expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}' + summary: 'Filesystem is predicted to run out of space within the next 4 hours.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' + summary: 'Filesystem has less than 5% space left.' + expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' + summary: 'Filesystem has less than 3% space left.' 
+ expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}' + summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}' + summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' + summary: 'Filesystem has less than 5% inodes left.' 
+ expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' + summary: 'Filesystem has less than 3% inodes left.' + expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}' + summary: 'Network interface is reporting many receive errors.' + expr: "increase(node_network_receive_errs_total[2m]) > 10\n" + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}' + summary: 'Network interface is reporting many transmit errors.' 
+ expr: "increase(node_network_transmit_errs_total[2m]) > 10\n" + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}' + summary: 'Number of conntrack are getting close to the limit' + expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n" + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}' + summary: 'Clock skew detected.' + expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}' + summary: 'Clock not synchronising.' 
+ expr: "min_over_time(node_timex_sync_status[5m]) == 0\n" + for: 10m + labels: + severity: warning # ------------------------------------------------------------------------------------------ From 337c1017ba9e3d3798f76a50e5e281089e5d1b6d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Oct 2024 09:57:33 +0100 Subject: [PATCH 25/90] Removed monitoring binaries from build --- ansible/fatimage.yml | 72 -------------------------------------------- 1 file changed, 72 deletions(-) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index e623c2794..d6c250c5d 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -101,12 +101,6 @@ tasks_from: install.yml when: "'filebeat' in group_names" - - import_role: - # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start - # however starting node exporter is ok - name: cloudalchemy.node_exporter - when: "'node_exporter' in group_names" - - name: openondemand exporter dnf: name: ondemand_exporter @@ -120,72 +114,6 @@ slurm_exporter_state: stopped when: "'slurm_exporter' in group_names" -- hosts: prometheus - become: yes - gather_facts: yes - tasks: - - import_role: - name: cloudalchemy.prometheus - tasks_from: preflight.yml - - # can't run cloudalchemy.prometheus/tasks/install.yml as it triggers a unit start - # so below is a partial extraction of this: - - name: create prometheus system group - group: - name: prometheus - system: true - state: present - - - name: create prometheus system user - user: - name: prometheus - system: true - shell: "/usr/sbin/nologin" - group: prometheus - createhome: false - home: "{{ prometheus_db_dir }}" - - - name: download prometheus binary to local folder - become: false - get_url: - url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" - dest: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch 
}}.tar.gz" - checksum: "sha256:{{ __prometheus_checksum }}" - register: _download_archive - until: _download_archive is succeeded - retries: 5 - delay: 2 - - - name: unpack prometheus binaries - become: false - unarchive: - remote_src: yes - src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" - dest: "/tmp" - creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/prometheus" - - - name: propagate official prometheus and promtool binaries - copy: - remote_src: yes - src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/{{ item }}" - dest: "{{ _prometheus_binary_install_dir }}/{{ item }}" - mode: 0755 - owner: root - group: root - with_items: - - prometheus - - promtool - -- hosts: grafana - become: yes - gather_facts: yes - tasks: - - name: Include distribution variables for cloudalchemy.grafana - include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml" - - import_role: - name: cloudalchemy.grafana - tasks_from: install.yml - - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" From dd1e46495311512ce7c1337a65efb604857f2ac7 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:18:54 +0100 Subject: [PATCH 26/90] bump for CI test --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 4a434e02f..d30b46121 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-ofed-RL8-241008-1531-2861edba", - "RL9": "openhpc-ofed-RL9-241008-1531-2861edba", - "RL9-cuda": "openhpc-cuda-RL9-241009-1523-354b048a" + 
"RL8": "openhpc-RL8-241014-0913-337c1017", + "RL9": "openhpc-RL9-241014-0913-337c1017", + "RL9-cuda": "openhpc-cuda-RL9-241014-1026-337c1017" } -} \ No newline at end of file +} From cc6bef133ab823d68602ffaf2a485ccba7459793 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Oct 2024 12:26:15 +0100 Subject: [PATCH 27/90] ported node-exporter vars --- .../kube_prometheus_stack/defaults/main/helm.yml | 4 ++++ .../kube_prometheus_stack/defaults/main/main.yml | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index a9d9cfdd9..4e66603ee 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -169,6 +169,10 @@ kube_prometheus_stack_release_defaults: nodeSelector: kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + prometheus-node-exporter: + image: + tag: "{{ node_exporter_image_tag }}" + kube_prometheus_stack_release_overrides: {} kube_prometheus_stack_release_values: >- diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 9c3575e17..c72dd6894 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -465,3 +465,14 @@ alertmanager_image_tag: v0.27.0 alertmanager_template_files: {} # alertmanager_http_config: {} + +### Previous node exporter vars + +node_exporter_image_tag: v1.8.2 + +# node_exporter_tls_server_config: {} + +# node_exporter_http_server_config: {} + +# node_exporter_basic_auth_users: {} + From 21a8d25723b2617718d0a9eee7e6e7998525a903 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Oct 2024 14:26:11 +0100 Subject: [PATCH 28/90] non-atomic helm install for ci test --- 
ansible/roles/kube_prometheus_stack/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index e0efa7975..6c67c9947 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -189,7 +189,7 @@ release_namespace: "{{ kube_prometheus_stack_release_namespace }}" release_name: "{{ kube_prometheus_stack_release_name }}" release_values: "{{ kube_prometheus_stack_release_values }}" - atomic: yes + atomic: no create_namespace: no wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" From 76842f1526bec4a05566a42bdac9c625f0c946bb Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Oct 2024 15:26:49 +0100 Subject: [PATCH 29/90] fixed hostnames not recognised by selector and defaulted slack integration off --- .../kube_prometheus_stack/defaults/main/helm.yml | 12 +++++++----- .../kube_prometheus_stack/defaults/main/main.yml | 11 ----------- 2 files changed, 7 insertions(+), 16 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 4e66603ee..666b7cb76 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -43,6 +43,8 @@ kube_prometheus_stack_prometheus_external_url: >- else "" }} +control_hostname: "{{ openhpc_cluster_name + '-control.'+ openhpc_cluster_name + '.' 
+ cluster_domain_suffix | lower }}" + # The values for the kube-prometheus-stack release kube_prometheus_stack_release_defaults: defaultRules: @@ -89,7 +91,7 @@ kube_prometheus_stack_release_defaults: runAsGroup: 0 fsGroup: 0 nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + kubernetes.io/hostname: "{{ control_hostname }}" additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" @@ -120,7 +122,7 @@ kube_prometheus_stack_release_defaults: additionalDataSources: "{{ grafana_datasources }}" plugins: "{{ grafana_plugins }}" nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + kubernetes.io/hostname: "{{ control_hostname }}" persistence: type: pvc enabled: true @@ -153,7 +155,7 @@ kube_prometheus_stack_release_defaults: alertmanagerConfigMatcherStrategy: type: None nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + kubernetes.io/hostname: "{{ control_hostname }}" scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" serviceMonitor: scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" @@ -163,11 +165,11 @@ kube_prometheus_stack_release_defaults: prometheusOperator: nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + kubernetes.io/hostname: "{{ control_hostname }}" kube-state-metrics: nodeSelector: - kubernetes.io/hostname: "{{ openhpc_cluster_name }}-control.{{ openhpc_cluster_name }}.{{ cluster_domain_suffix }}" + kubernetes.io/hostname: "{{ control_hostname }}" prometheus-node-exporter: image: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index c72dd6894..50a10a0c3 100644 --- 
a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -27,21 +27,10 @@ slack_integration: alertmanager_config: route: group_by: ['...'] - receiver: slack-receiver global: resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" receivers: - name: 'null' - - name: slack-receiver - slack_configs: - - channel: "{{ slack_integration.channel }}" - api_url: https://slack.com/api/chat.postMessage - http_config: - authorization: - credentials: "{{ slack_integration.app_creds }}" - text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" - title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver" - send_resolved: true alertmanager_replicas: 1 From 185eafb7afde49ef79421e53209703436e1943a4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 14 Oct 2024 16:02:40 +0100 Subject: [PATCH 30/90] fixed k3s hostnames properly --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 666b7cb76..ed6aadf81 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -43,7 +43,7 @@ kube_prometheus_stack_prometheus_external_url: >- else "" }} -control_hostname: "{{ openhpc_cluster_name + '-control.'+ openhpc_cluster_name + '.' + cluster_domain_suffix | lower }}" +control_hostname: "{{ ( openhpc_cluster_name + '-control.'+ openhpc_cluster_name + '.' 
+ cluster_domain_suffix ) | lower }}" # The values for the kube-prometheus-stack release kube_prometheus_stack_release_defaults: From 10d4e93eb895c7aa81131b4d1963bbdc345ea8d3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 15 Oct 2024 10:07:29 +0100 Subject: [PATCH 31/90] increased control node CI memory --- environments/.stackhpc/terraform/LEAFCLOUD.tfvars | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/.stackhpc/terraform/LEAFCLOUD.tfvars b/environments/.stackhpc/terraform/LEAFCLOUD.tfvars index 5e73896c8..3954a34a8 100644 --- a/environments/.stackhpc/terraform/LEAFCLOUD.tfvars +++ b/environments/.stackhpc/terraform/LEAFCLOUD.tfvars @@ -1,6 +1,6 @@ cluster_net = "slurmapp-ci" cluster_subnet = "slurmapp-ci" -control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment +control_node_flavor = "en1.medium" # min 8GB RAM other_node_flavor = "en1.xsmall" state_volume_type = "unencrypted" home_volume_type = "unencrypted" From d1e8c0ad2aa62e95da8f5922f06ba68697dcc73b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 15 Oct 2024 13:45:56 +0100 Subject: [PATCH 32/90] Refactored monitoring config and removed redundant groups --- .../defaults/main/helm.yml | 6 +- .../defaults/main/main.yml | 137 +----------------- .../kube_prometheus_stack/tasks/main.yml | 6 +- .../inventory/group_vars/all/alertmanager.yml | 26 ++++ .../inventory/group_vars/all/defaults.yml | 25 +--- .../inventory/group_vars/all/grafana.yml | 44 +----- .../inventory/group_vars/all/monitoring.yml | 6 + .../inventory/group_vars/all/openondemand.yml | 8 +- .../inventory/group_vars/all/prometheus.yml | 45 +++--- environments/common/inventory/groups | 10 -- environments/common/layouts/everything | 9 -- 11 files changed, 78 insertions(+), 244 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/alertmanager.yml create mode 100644 environments/common/inventory/group_vars/all/monitoring.yml diff 
--git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index ed6aadf81..c6b65c110 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -73,7 +73,7 @@ kube_prometheus_stack_release_defaults: - ReadWriteOnce resources: requests: - storage: "{{ kube_prometheus_stack_volume_size }}" + storage: "{{ prometheus_volume_size }}" retention: "{{ prometheus_storage_retention }}" retentionSize: "{{ prometheus_storage_retention_size }}" additionalAlertRelabelConfigs: "{{ prometheus_alert_relabel_configs }}" @@ -102,7 +102,7 @@ kube_prometheus_stack_release_defaults: serviceMonitor: enabled: false ingress: - path: "/node/{{ groups['grafana'].0 }}/{{ grafana_port }}" + path: "/node/{{ groups['prometheus'].0 }}/{{ grafana_port }}" sidecar: dashboards: searchNamespace: ALL @@ -113,7 +113,7 @@ kube_prometheus_stack_release_defaults: serve_from_sub_path: true auth: "{{ grafana_auth }}" auth.anonymous: - enabled: "{{ grafana_anonymous_auth }}" + enabled: "{{ grafana_auth_anonymous }}" analytics: "{{ grafana_analytics }}" smtp: "{{ grafana_smtp }}" log: "{{ grafana_log }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 50a10a0c3..4815269c9 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -16,9 +16,9 @@ login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" -grafana_claim_size: 10Gi +grafana_volume_size: 10Gi -grafana_anonymous_auth: true +grafana_auth_anonymous: true slack_integration: channel: "#alerts" @@ -64,7 +64,7 @@ prometheus_storage_retention: "30d" # 
supported: KB, MB, GB, TB, PB. prometheus_storage_retention_size: "40GB" -kube_prometheus_stack_volume_size: 40Gi +prometheus_volume_size: 40Gi prometheus_config_flags_extra: {} # prometheus_config_flags_extra: @@ -126,138 +126,13 @@ prometheus_scrape_configs: # - prometheus/targets/*.yml # - prometheus/targets/*.json +prometheus_extra_alert_rules: [] + prometheus_alert_rules: appliance-rules: groups: - name: all - rules: - - alert: Watchdog - expr: vector(1) - for: 10m - labels: - severity: warning - alertname: Watchdog - annotations: - description: "This is an alert meant to ensure that the entire alerting pipeline is functional.\nThis alert is always firing, therefore it should always be firing in Alertmanager\nand always fire against a receiver. There are integrations with various notification\nmechanisms that send a notification when this alert is not firing. For example the\n\"DeadMansSnitch\" integration in PagerDuty." - summary: 'Ensure entire alerting pipeline is functional' - - alert: InstanceDown - expr: 'up == 0' - for: 5m - labels: - severity: critical - annotations: - description: '{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}' - summary: '{% raw %}Instance {{ $labels.instance }} down{% endraw %}' - - alert: RebootRequired - expr: 'node_reboot_required > 0' - labels: - severity: warning - annotations: - description: '{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}' - summary: '{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}' - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.{% endraw %}' - summary: 'Filesystem is predicted to run out of space within the next 24 hours.' 
- expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.{% endraw %}' - summary: 'Filesystem is predicted to run out of space within the next 4 hours.' - expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' - summary: 'Filesystem has less than 5% space left.' - expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.{% endraw %}' - summary: 'Filesystem has less than 3% space left.' 
- expr: "(\n node_filesystem_avail_bytes{job=\"node\",fstype!=\"\"} / node_filesystem_size_bytes{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.{% endraw %}' - summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.' - expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 40\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 24*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.{% endraw %}' - summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.' - expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 20\nand\n predict_linear(node_filesystem_files_free{job=\"node\",fstype!=\"\"}[6h], 4*60*60) < 0\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' - summary: 'Filesystem has less than 5% inodes left.' 
- expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 5\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: '{% raw %}Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.{% endraw %}' - summary: 'Filesystem has less than 3% inodes left.' - expr: "(\n node_filesystem_files_free{job=\"node\",fstype!=\"\"} / node_filesystem_files{job=\"node\",fstype!=\"\"} * 100 < 3\nand\n node_filesystem_readonly{job=\"node\",fstype!=\"\"} == 0\n)\n" - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.{% endraw %}' - summary: 'Network interface is reporting many receive errors.' - expr: "increase(node_network_receive_errs_total[2m]) > 10\n" - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{% raw %}{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.{% endraw %}' - summary: 'Network interface is reporting many transmit errors.' 
- expr: "increase(node_network_transmit_errs_total[2m]) > 10\n" - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{% raw %}{{ $value | humanizePercentage }} of conntrack entries are used{% endraw %}' - summary: 'Number of conntrack are getting close to the limit' - expr: "(node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75\n" - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: '{% raw %}Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure NTP is configured correctly on this host.{% endraw %}' - summary: 'Clock skew detected.' - expr: "(\n node_timex_offset_seconds > 0.05\nand\n deriv(node_timex_offset_seconds[5m]) >= 0\n)\nor\n(\n node_timex_offset_seconds < -0.05\nand\n deriv(node_timex_offset_seconds[5m]) <= 0\n)\n" - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: '{% raw %}Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.{% endraw %}' - summary: 'Clock not synchronising.' 
- expr: "min_over_time(node_timex_sync_status[5m]) == 0\n" - for: 10m - labels: - severity: warning + rules: "{{ prometheus_extra_alert_rules }}" # ------------------------------------------------------------------------------------------ diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 6c67c9947..a0b301e46 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -46,7 +46,7 @@ app.kubernetes.io/name: prometheus-dir spec: capacity: - storage: "{{ kube_prometheus_stack_volume_size }}" + storage: "{{ prometheus_volume_size }}" accessModes: - ReadWriteOnce hostPath: @@ -65,7 +65,7 @@ app.kubernetes.io/name: grafana-dir spec: capacity: - storage: "{{ grafana_claim_size }}" + storage: "{{ grafana_volume_size }}" accessModes: - ReadWriteOnce hostPath: @@ -85,7 +85,7 @@ - ReadWriteOnce resources: requests: - storage: "{{ grafana_claim_size }}" + storage: "{{ grafana_volume_size }}" volumeMode: Filesystem volumeName: grafana-dir diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml new file mode 100644 index 000000000..44eaa315b --- /dev/null +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -0,0 +1,26 @@ +alertmanager_replicas: 1 +alertmanager_port: 30002 # Must be within K3s' reserved port range (default 30000-32767) + +# Add receivers here, uncomment below and add Slack bot app creds for Slack integration +alertmanager_config: + route: + group_by: ['...'] + # receiver: slack-receiver + global: + resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" + receivers: + - name: 'null' + # - name: slack-receiver + # slack_configs: + # - channel: "{{ slack_integration.channel }}" + # api_url: https://slack.com/api/chat.postMessage + # http_config: + # authorization: + # credentials: "{{ 
slack_integration.app_creds }}" + # text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" + # title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver" + # send_resolved: true + +# slack_integration: +# channel: '#alerts' +# app_creds: diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 15340820f..43c3c9d16 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -18,7 +18,7 @@ api_address: "{{ inventory_hostname }}" opensearch_address: "127.0.0.1" prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}" openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}" -grafana_address: "{{ hostvars[groups['grafana'].0].api_address }}" +grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}" ############################# bootstrap: local user configuration ######################### @@ -50,29 +50,6 @@ appliances_local_users_default: shell: /sbin/nologin uid: 202 system: true - - - group: - name: prometheus - gid: 976 - user: - name: prometheus - uid: 981 - home: "{{ prometheus_db_dir }}" - shell: /usr/sbin/nologin - system: true - enable: "{{ 'prometheus' in group_names }}" - - - group: - name: grafana - gid: 979 - user: - name: grafana - comment: grafana user - uid: 984 - home: /usr/share/grafana - shell: /sbin/nologin - system: true - enable: "{{ 'grafana' in group_names }}" # Overide this to add extra users whilst keeping the defaults. 
appliances_local_users_extra: [] # see format of appliances_local_users_default above diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index cc6a325b4..d8f591c68 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -1,24 +1,15 @@ ---- - -# See: https://github.com/cloudalchemy/ansible-grafana -# for variable definitions. -grafana_version: '9.5.21' - -# need to copy some role defaults here so we can use in inventory: -grafana_port: 30001 +grafana_image_tag: '11.2.2' +grafana_port: 30001 # Must be within K3s' reserved port range (default 30000-32767) # Define where state is stored grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana" -# Configure internal address & URL - note "api" means "internal" to cloudalchemy.grafana but "external" to appliance: -grafana_api_address: "{{ hostvars[groups['grafana'].0].internal_address }}" -grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" - # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy +grafana_api_address: "{{ hostvars[groups['prometheus'].0].internal_address }}" +grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" grafana_url_direct: "http://{{ grafana_address }}:{{ grafana_port }}" -grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['grafana'].0 }}/{{ grafana_port }}" +grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['prometheus'].0 }}/{{ grafana_port }}" grafana_url: "{{ grafana_url_openondemand_proxy if groups['openondemand'] | count > 0 else grafana_url_direct }}" -grafana_serve_from_sub_path: "{{ groups['openondemand'] | count > 0 }}" grafana_dashboards_default: # node exporter slurm: @@ -49,7 +40,6 @@ grafana_dashboards_default: - placeholder: 
DS_PROMETHEUS replacement: prometheus revision_id: 3 - grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}" grafana_security: @@ -58,10 +48,6 @@ grafana_security: allow_embedding: true grafana_datasources: - # - name: prometheus - # type: prometheus - # url: "http://{{ prometheus_address }}:9090" # default prometheus port - # editable: true - name: slurmstats # see https://github.com/grafana/opensearch-datasource#configure-the-data-source-with-provisioning type: grafana-opensearch-datasource @@ -81,27 +67,11 @@ grafana_datasources: flavor: elasticsearch editable: true # readOnly: false - grafana_plugins: - grafana-opensearch-datasource 2.8.1 -# want to set grafana_server.serve_from_sub_path if have Open Ondemand to proxy: -grafana_server: - # role defaults: - protocol: http - enforce_domain: false - socket: "" - cert_key: "" - cert_file: "" - enable_gzip: false - static_root_path: public - router_logging: false - # appliance specific: - serve_from_sub_path: "{{ grafana_serve_from_sub_path }}" - - -grafana_auth_anonymous: false # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards - +grafana_auth_anonymous: true # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards +grafana_volume_size: 10Gi _grafana_auth_anon_cfg: anonymous: org_name: "Main Org." 
diff --git a/environments/common/inventory/group_vars/all/monitoring.yml b/environments/common/inventory/group_vars/all/monitoring.yml new file mode 100644 index 000000000..098039b44 --- /dev/null +++ b/environments/common/inventory/group_vars/all/monitoring.yml @@ -0,0 +1,6 @@ +kube_prometheus_stack_chart_version: 59.1.0 +kube_prometheus_stack_release_namespace: monitoring-system +kube_prometheus_stack_release_name: kube-prometheus-stack +kube_prometheus_stack_wait_timeout: 5m + +# See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index f206f8192..621735000 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -11,7 +11,7 @@ # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. # The autogenerated regex may need overriding if compute node names do not contain numbers in a consistent position # or include regex special characters. 
-openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" +openondemand_host_regex: "{{ (groups['compute'] + groups['prometheus']) | to_ood_regex }}" # Add grafana to dashboard links to OOD only if grafana group is available openondemand_dashboard_links_grafana: @@ -20,7 +20,7 @@ openondemand_dashboard_links_grafana: category: Monitoring description: Dashboards url: "{{ grafana_url_openondemand_proxy }}" -openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if groups['grafana'] | length > 0 }}" +openondemand_dashboard_links: "{{ openondemand_dashboard_links_grafana if groups['prometheus'] | length > 0 }}" openondemand_login_host: localhost @@ -56,7 +56,7 @@ openondemand_clusters: export -f xfce4-session %s set_host: host=$(hostname -s) - custom: "{{ openondemand_clusters_grafana if groups['grafana'] | length > 0 else {} }}" + custom: "{{ openondemand_clusters_grafana if groups['prometheus'] | length > 0 else {} }}" openondemand_clusters_grafana: # embed grafana panels in Jobs app: https://osc.github.io/ood-documentation/latest/customization.html#grafana-support @@ -193,5 +193,5 @@ openondemand_dashboard: _opeonondemand_unset_auth: ' RequestHeader unset Authorization' # Fix grafana proxying for basic auth if anonymous grafana access enabled: -openondemand_node_proxy_directives: "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}" +openondemand_node_proxy_directives: "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['prometheus'] | length > 0 and hostvars[ groups['prometheus'] | first]._grafana_auth_is_anonymous) else '' }}" # Reason: OOD server forwards headers to proxied servers, so when if using basic auth Grafana gets passed the Open Ondemand user. 
This probably isn't a Grafana user so it throws an auth error. If anonymous access is enabled we can work around this by not forwarding auth header. diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 9c800e090..5fdef9728 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -1,26 +1,12 @@ ---- +# Must be within K3s' reserved port range (default 30000-32767) +prometheus_port: 30000 -# See: https://github.com/cloudalchemy/ansible-prometheus -# for variable definitions - -prometheus_version: 2.27.0 # default from ansible/roles/cloudalchemy.prometheus/defaults/main.yml -prometheus_web_external_url: "http://{{ prometheus_address }}:9090" -prometheus_storage_retention: "31d" -prometheus_storage_retention_size: "100GB" -prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus" - -prometheus_alertmanager_config: [] - -prometheus_alert_rules_files: -- "{{ appliances_repository_root }}/environments/common/files/prometheus/rules/*.rules" - -prometheus_alert_rules: [] - -# Can set a hostvar 'env' to an arbitrary string to group prometheus targets, e.g. rack. 
-# env: location-1 -prometheus_targets: - node: "{{ groups.get('node_exporter', []) | reject('equalto', 'localhost') | prometheus_node_exporter_targets(env | default('ungrouped')) }}" +prometheus_image_tag: "v2.27.0" +prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" +prometheus_storage_retention: "30d" +prometheus_volume_size: 40Gi +prometheus_storage_retention_size: "40GB" prometheus_scrape_configs_default: - job_name: "slurm_exporter" scrape_interval: 30s @@ -28,7 +14,20 @@ prometheus_scrape_configs_default: static_configs: - targets: - "{{ control_ip }}:{{ slurm_exporter_port }}" - -kube_prometheus_stack_release_namespace: monitoring-system + relabel_configs: + # strip off port + - source_labels: ['__address__'] + separator: ':' + regex: '(.*):.*' + target_label: 'instance' + replacement: '${1}' prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" +prometheus_extra_alert_rules: + - alert: SlurmNodeDown + annotations: + message: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}' + summary: 'At least one Slurm node is down.' + expr: "slurm_nodes_down > 0\n" + labels: + severity: critical diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 353ffab4b..903f7f5ab 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -36,12 +36,6 @@ mysql [prometheus] # Single node to host monitoring server. -[grafana] -# Single node to host monitoring dashboards. - -[alertmanager] -# TODO: - [opensearch] # Single node to host ElasticSearch search engine for Slurm monitoring. @@ -58,9 +52,6 @@ mysql [mysql] # Single node to run database used for Slurm accounting. -[node_exporter] -# All hosts to monitor for hardware and OS metrics. - [selinux:children] # All hosts requiring control of SELinux status. 
cluster @@ -100,7 +91,6 @@ fail2ban [systemd:children] # Hosts to make systemd unit adjustments on opensearch -grafana control prometheus diff --git a/environments/common/layouts/everything b/environments/common/layouts/everything index 613d89497..51440bf9e 100644 --- a/environments/common/layouts/everything +++ b/environments/common/layouts/everything @@ -7,15 +7,6 @@ control [prometheus:children] control -[grafana:children] -control - -[alertmanager:children] -control - -[node_exporter:children] -cluster - [opensearch:children] control From 96723f818c5c50823d94c7fa6efe002b611f209e Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 15 Oct 2024 14:36:14 +0100 Subject: [PATCH 33/90] updated dashboard defaults --- ansible/roles/grafana-dashboards/defaults/main.yml | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/ansible/roles/grafana-dashboards/defaults/main.yml b/ansible/roles/grafana-dashboards/defaults/main.yml index 064b86498..6885a3e2a 100644 --- a/ansible/roles/grafana-dashboards/defaults/main.yml +++ b/ansible/roles/grafana-dashboards/defaults/main.yml @@ -1,15 +1,12 @@ --- - -grafana_address: "0.0.0.0" -grafana_port: 80 +grafana_port: 30001 # External Grafana address. 
Variable maps to "root_url" in grafana server section grafana_url: "http://{{ grafana_address }}:{{ grafana_port }}" grafana_api_url: "{{ grafana_url }}" grafana_security: - admin_user: admin - admin_password: "" + admin_user: grafana + admin_password: "{{ vault_grafana_admin_password }}" -grafana_data_dir: "/var/lib/grafana" -grafana_dashboards_dir: "dashboards" +grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana" From bf9a473ecf4514728aeaefbff4766ff18c46f93c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 15 Oct 2024 15:24:40 +0100 Subject: [PATCH 34/90] fixed caas cluster name --- environments/.caas/inventory/group_vars/all/monitoring.yml | 1 + 1 file changed, 1 insertion(+) create mode 100644 environments/.caas/inventory/group_vars/all/monitoring.yml diff --git a/environments/.caas/inventory/group_vars/all/monitoring.yml b/environments/.caas/inventory/group_vars/all/monitoring.yml new file mode 100644 index 000000000..ee7d9af35 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/monitoring.yml @@ -0,0 +1 @@ +control_hostname: "{{ ( openhpc_cluster_name + '-control.'+ openhpc_cluster_name ) | lower }}" \ No newline at end of file From fdb5c237348d79b2ca4f25f2056964d7e9c0a875 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 15 Oct 2024 16:21:00 +0100 Subject: [PATCH 35/90] nodeselectors now use custom labels --- ansible/roles/k3s/tasks/main.yml | 4 ++-- .../kube_prometheus_stack/defaults/main/helm.yml | 12 +++++------- .../.caas/inventory/group_vars/all/monitoring.yml | 1 - 3 files changed, 7 insertions(+), 10 deletions(-) delete mode 100644 environments/.caas/inventory/group_vars/all/monitoring.yml diff --git a/ansible/roles/k3s/tasks/main.yml b/ansible/roles/k3s/tasks/main.yml index 9e9e2ddf2..f455a0c67 100644 --- a/ansible/roles/k3s/tasks/main.yml +++ b/ansible/roles/k3s/tasks/main.yml @@ -28,8 +28,8 @@ INSTALL_K3S_BIN_DIR: "/usr/bin" changed_when: true loop: - - server --disable=traefik - - agent + - server 
--disable=traefik --node-label clusterrole=server + - agent --node-label clusterrole=agent - name: Install helm unarchive: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index c6b65c110..45bbd6974 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -43,8 +43,6 @@ kube_prometheus_stack_prometheus_external_url: >- else "" }} -control_hostname: "{{ ( openhpc_cluster_name + '-control.'+ openhpc_cluster_name + '.' + cluster_domain_suffix ) | lower }}" - # The values for the kube-prometheus-stack release kube_prometheus_stack_release_defaults: defaultRules: @@ -91,7 +89,7 @@ kube_prometheus_stack_release_defaults: runAsGroup: 0 fsGroup: 0 nodeSelector: - kubernetes.io/hostname: "{{ control_hostname }}" + clusterrole: "server" additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" @@ -122,7 +120,7 @@ kube_prometheus_stack_release_defaults: additionalDataSources: "{{ grafana_datasources }}" plugins: "{{ grafana_plugins }}" nodeSelector: - kubernetes.io/hostname: "{{ control_hostname }}" + clusterrole: "server" persistence: type: pvc enabled: true @@ -155,7 +153,7 @@ kube_prometheus_stack_release_defaults: alertmanagerConfigMatcherStrategy: type: None nodeSelector: - kubernetes.io/hostname: "{{ control_hostname }}" + clusterrole: "server" scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" serviceMonitor: scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" @@ -165,11 +163,11 @@ kube_prometheus_stack_release_defaults: prometheusOperator: nodeSelector: - kubernetes.io/hostname: "{{ control_hostname }}" + clusterrole: "server" kube-state-metrics: nodeSelector: - kubernetes.io/hostname: "{{ control_hostname }}" + clusterrole: "server" prometheus-node-exporter: image: diff --git a/environments/.caas/inventory/group_vars/all/monitoring.yml 
b/environments/.caas/inventory/group_vars/all/monitoring.yml deleted file mode 100644 index ee7d9af35..000000000 --- a/environments/.caas/inventory/group_vars/all/monitoring.yml +++ /dev/null @@ -1 +0,0 @@ -control_hostname: "{{ ( openhpc_cluster_name + '-control.'+ openhpc_cluster_name ) | lower }}" \ No newline at end of file From 4bffe4b960675991757b75ebd4872b3f4bd2509b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 16 Oct 2024 11:34:13 +0100 Subject: [PATCH 36/90] fixed (?) grafana zenith proxy --- environments/.caas/hooks/post.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index 309610ff9..d349d3994 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -1,6 +1,6 @@ # Configure the Zenith clients that are required # Note zenith hosts are in podman group -- hosts: grafana +- hosts: prometheus tasks: - name: Deploy the Zenith client for Grafana include_role: @@ -15,8 +15,8 @@ tenancy-id: "{{ openstack_project_id }}" zenith_proxy_mitm_enabled: yes zenith_proxy_mitm_auth_inject: basic - zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" - zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" + zenith_proxy_mitm_auth_basic_username: azimuth + zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" when: zenith_subdomain_monitoring is defined - hosts: openondemand From b0f856ee38b42b7f99a371568b907b3ffa431ecc Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 16 Oct 2024 12:45:18 +0100 Subject: [PATCH 37/90] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index d30b46121..fc7efcf07 
100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241014-0913-337c1017", - "RL9": "openhpc-RL9-241014-0913-337c1017", - "RL9-cuda": "openhpc-cuda-RL9-241014-1026-337c1017" + "RL8": "openhpc-RL8-241015-1524-fdb5c237", + "RL9": "openhpc-RL9-241015-1524-fdb5c237", + "RL9-cuda": "openhpc-cuda-RL9-241015-1524-fdb5c237" } } From 12e11660b46fe3c4972adf18b390f90722b800a6 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 16 Oct 2024 12:50:49 +0100 Subject: [PATCH 38/90] added old recording rules to defaults --- .../kube_prometheus_stack/defaults/main/helm.yml | 2 +- .../kube_prometheus_stack/defaults/main/main.yml | 6 +++--- .../inventory/group_vars/all/prometheus.yml | 16 +++++++++++++++- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 45bbd6974..b81306649 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -91,7 +91,7 @@ kube_prometheus_stack_release_defaults: nodeSelector: clusterrole: "server" - additionalPrometheusRulesMap: "{{ prometheus_alert_rules }}" + additionalPrometheusRulesMap: "{{ prometheus_rules }}" grafana: service: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 4815269c9..114630825 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -126,13 +126,13 @@ prometheus_scrape_configs: # - prometheus/targets/*.yml # - prometheus/targets/*.json -prometheus_extra_alert_rules: [] +prometheus_extra_rules: [] -prometheus_alert_rules: +prometheus_rules: appliance-rules: groups: - name: all - rules: "{{ 
prometheus_extra_alert_rules }}" + rules: "{{ prometheus_extra_rules }}" # ------------------------------------------------------------------------------------------ diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 5fdef9728..0216bdc40 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -23,7 +23,7 @@ prometheus_scrape_configs_default: replacement: '${1}' prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" -prometheus_extra_alert_rules: +prometheus_extra_rules: - alert: SlurmNodeDown annotations: message: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}' @@ -31,3 +31,17 @@ prometheus_extra_alert_rules: expr: "slurm_nodes_down > 0\n" labels: severity: critical + - record: node_cpu_system_seconds:record + expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) + - record: node_cpu_user_seconds:record + expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="user",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) + - record: node_cpu_iowait_seconds:record + expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="iowait",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) + - record: node_cpu_other_seconds:record + expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode!="idle",mode!="user",mode!="system",mode!="iowait",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) + - record: node_cpu_scaling_frequency_hertz_avg:record # frequency rules 
aren't working + expr: avg by (instance) (node_cpu_scaling_frequency_hertz) + - record: node_cpu_scaling_frequency_hertz_min:record + expr: min by (instance) (node_cpu_scaling_frequency_hertz) + - record: node_cpu_scaling_frequency_hertz_max:record + expr: max by (instance) (node_cpu_scaling_frequency_hertz) From 5f89be8c1ab26270c9bd6f0f52bde4e2cccdafda Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 17 Oct 2024 14:08:04 +0100 Subject: [PATCH 39/90] fixed openhpc dashboard --- .../files/openhpc-slurm.json | 20 +++++++++---------- .../defaults/main/helm.yml | 13 ++++++++++++ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json index fb4078c5e..820fc4973 100644 --- a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json +++ b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json @@ -711,7 +711,7 @@ "steppedLine": false, "targets": [ { - "expr": "100 - (100 * node_memory_MemAvailable_bytes{job=~\"node\", instance=~\"$instance\"} / node_memory_MemTotal_bytes{job=~\"node\", instance=~\"$instance\"})", + "expr": "100 - (100 * node_memory_MemAvailable_bytes{job=~\"node-exporter\", instance=~\"$instance\"} / node_memory_MemTotal_bytes{job=~\"node-exporter\", instance=~\"$instance\"})", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -818,7 +818,7 @@ "steppedLine": false, "targets": [ { - "expr": "(100 * sum by(instance)(increase(node_cpu_seconds_total{mode=\"iowait\",job=~\"node_fast\"}[1s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job=~\"node_fast\"}[1s])))", + "expr": "(100 * sum by(instance)(increase(node_cpu_seconds_total{mode=\"iowait\",job=~\"node-exporter_fast\"}[1s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job=~\"node-exporter_fast\"}[1s])))", "hide": true, "interval": "", "legendFormat": "{{ instance }}", @@ -1489,7 +1489,7 @@ "steppedLine": false, "targets": [ { - "expr": 
"increase(node_infiniband_port_transmit_wait_total{job=\"node\",instance=~\"$instance\"}[1m]) / increase(node_infiniband_port_packets_transmitted_total{job=\"node\",instance=~\"$instance\"}[1m])", + "expr": "increase(node_infiniband_port_transmit_wait_total{job=\"node-exporter\",instance=~\"$instance\"}[1m]) / increase(node_infiniband_port_packets_transmitted_total{job=\"node-exporter\",instance=~\"$instance\"}[1m])", "hide": false, "interval": "", "legendFormat": "{{ instance }} {{ device }} {{ port }}", @@ -1950,15 +1950,15 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_cpu_seconds_total{job=~\"node\"}, env)", + "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, cluster_env)", "error": null, "hide": 0, "includeAll": true, "label": "Environment", "multi": true, - "name": "env", + "name": "cluster_env", "options": [], - "query": "label_values(node_cpu_seconds_total{job=~\"node\"}, env)", + "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, cluster_env)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -1994,7 +1994,7 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_cpu_seconds_total{job=~\"node\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", + "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", cluster_env=~\"$cluster_env\", instance=~\"$host_filter\"}, instance)", "error": null, "hide": 0, "includeAll": true, @@ -2002,7 +2002,7 @@ "multi": true, "name": "instance", "options": [], - "query": "label_values(node_cpu_seconds_total{job=~\"node\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", + "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", cluster_env=~\"$cluster_env\", instance=~\"$host_filter\"}, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -2017,7 +2017,7 @@ "allValue": null, "current": {}, "datasource": 
"${DS_PROMETHEUS}", - "definition": "label_values(node_infiniband_info{job=~\"node\"}, device)", + "definition": "label_values(node_infiniband_info{job=~\"node-exporter\"}, device)", "error": null, "hide": 0, "includeAll": true, @@ -2025,7 +2025,7 @@ "multi": true, "name": "device", "options": [], - "query": "label_values(node_infiniband_info{job=~\"node\"}, device)", + "query": "label_values(node_infiniband_info{job=~\"node-exporter\"}, device)", "refresh": 1, "regex": "", "skipUrlSync": false, diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index b81306649..cf0ee434b 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -172,6 +172,19 @@ kube_prometheus_stack_release_defaults: prometheus-node-exporter: image: tag: "{{ node_exporter_image_tag }}" + prometheus: + monitor: + relabelings: + # relabels instances to hostnames with suffixes stripped + - sourceLabels: [__meta_kubernetes_pod_node_name] + separator: ; + regex: ([^.]+).* + targetLabel: instance + replacement: $1 + action: replace + metricRelabelings: + - targetLabel: cluster_env + replacement: ungrouped kube_prometheus_stack_release_overrides: {} From 2d1dab53caa047b9a6866ba93a075dd537eaec75 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 10:54:18 +0100 Subject: [PATCH 40/90] Refactored and fixed slack integration --- .../inventory/group_vars/all/alertmanager.yml | 41 ++++++++++--------- .../inventory/group_vars/all/defaults.yml | 2 +- .../inventory/group_vars/all/prometheus.yml | 4 +- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index 44eaa315b..a78ffee7e 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ 
b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -1,26 +1,27 @@ -alertmanager_replicas: 1 -alertmanager_port: 30002 # Must be within K3s' reserved port range (default 30000-32767) - -# Add receivers here, uncomment below and add Slack bot app creds for Slack integration alertmanager_config: route: group_by: ['...'] - # receiver: slack-receiver - global: - resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" - receivers: - - name: 'null' - # - name: slack-receiver - # slack_configs: - # - channel: "{{ slack_integration.channel }}" - # api_url: https://slack.com/api/chat.postMessage - # http_config: - # authorization: - # credentials: "{{ slack_integration.app_creds }}" - # text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" - # title_link: "http://{{ control_ip }}/alertmanager/#/alerts?receiver=slack-receiver" - # send_resolved: true + receiver: "{{ 'slack-receiver' if alertmanager_slack_integration is defined else 'null' }}" + receivers: "{{ alertmanager_default_receivers + alertmanager_extra_receivers }}" + +alertmanager_default_receivers: + - name: 'null' + +alertmanager_extra_receivers: "{{ [alertmanager_slack_receiver] if alertmanager_slack_integration is defined else [] }}" + +alertmanager_slack_receiver: + name: slack-receiver + slack_configs: + - channel: "{{ alertmanager_slack_integration.channel | default('none') }}" + api_url: https://slack.com/api/chat.postMessage + http_config: + authorization: + credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}" + text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" + title_link: "http://{{ prometheus_address }}/alertmanager/#/alerts?receiver=slack-receiver" + send_resolved: true -# slack_integration: +# Uncomment below and add Slack bot app creds for Slack integration +# alertmanager_slack_integration: # channel: '#alerts' 
# app_creds: diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 43c3c9d16..422602754 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -16,7 +16,7 @@ api_address: "{{ inventory_hostname }}" # Service endpoints opensearch_address: "127.0.0.1" -prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}" +prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}:{{ prometheus_port }}" openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}" grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}" diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 0216bdc40..2783dc7bb 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -26,7 +26,7 @@ prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand prometheus_extra_rules: - alert: SlurmNodeDown annotations: - message: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}' + description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}' summary: 'At least one Slurm node is down.' 
expr: "slurm_nodes_down > 0\n" labels: @@ -39,7 +39,7 @@ prometheus_extra_rules: expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="iowait",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) - record: node_cpu_other_seconds:record expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode!="idle",mode!="user",mode!="system",mode!="iowait",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) - - record: node_cpu_scaling_frequency_hertz_avg:record # frequency rules aren't working + - record: node_cpu_scaling_frequency_hertz_avg:record # Warning: frequency rules will not work when deploying appliance on VMs expr: avg by (instance) (node_cpu_scaling_frequency_hertz) - record: node_cpu_scaling_frequency_hertz_min:record expr: min by (instance) (node_cpu_scaling_frequency_hertz) From 43f27a54ce0051214ed21cbc16d07a59d2fbdb39 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 12:29:45 +0100 Subject: [PATCH 41/90] removed unused config options --- .../grafana-dashboards/defaults/main.yml | 12 - .../defaults/main/helm.yml | 13 - .../defaults/main/main.yml | 246 +----------------- 3 files changed, 3 insertions(+), 268 deletions(-) delete mode 100644 ansible/roles/grafana-dashboards/defaults/main.yml diff --git a/ansible/roles/grafana-dashboards/defaults/main.yml b/ansible/roles/grafana-dashboards/defaults/main.yml deleted file mode 100644 index 6885a3e2a..000000000 --- a/ansible/roles/grafana-dashboards/defaults/main.yml +++ /dev/null @@ -1,12 +0,0 @@ ---- -grafana_port: 30001 - -# External Grafana address. 
Variable maps to "root_url" in grafana server section -grafana_url: "http://{{ grafana_address }}:{{ grafana_port }}" -grafana_api_url: "{{ grafana_url }}" - -grafana_security: - admin_user: grafana - admin_password: "{{ vault_grafana_admin_password }}" - -grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index cf0ee434b..e6b8cfe7b 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -78,8 +78,6 @@ kube_prometheus_stack_release_defaults: scrapeInterval: "{{ prometheus_global.scrape_interval }}" scrapeTimeout: "{{ prometheus_global.scrape_timeout }}" evaluationInterval: "{{ prometheus_global.evaluation_interval }}" - remoteRead: "{{ prometheus_remote_read }}" - remoteWrite: "{{ prometheus_remote_write }}" externalLabels: "{{ prometheus_external_labels }}" additionalScrapeConfigs: "{{ prometheus_scrape_configs }}" # may need to change these @@ -112,11 +110,6 @@ kube_prometheus_stack_release_defaults: auth: "{{ grafana_auth }}" auth.anonymous: enabled: "{{ grafana_auth_anonymous }}" - analytics: "{{ grafana_analytics }}" - smtp: "{{ grafana_smtp }}" - log: "{{ grafana_log }}" - tracing: "{{ grafana_tracing }}" - panels: "{{ grafana_panels }}" additionalDataSources: "{{ grafana_datasources }}" plugins: "{{ grafana_plugins }}" nodeSelector: @@ -130,7 +123,6 @@ kube_prometheus_stack_release_defaults: config: "{{ grafana_ldap | to_yaml }}" image: tag: "{{ grafana_image_tag }}" - env: "{{ grafana_environment }}" adminUser: "{{ grafana_security.admin_user }}" adminPassword: "{{ grafana_security.admin_password }}" @@ -154,11 +146,6 @@ kube_prometheus_stack_release_defaults: type: None nodeSelector: clusterrole: "server" - scheme: "{{ prometheus_alertmanager_config.scheme | default( '' ) }}" - serviceMonitor: - scheme: "{{ 
prometheus_alertmanager_config.scheme | default( '' ) }}" - proxyUrl: "{{ prometheus_alertmanager_config.proxy_url | default( '' ) }}" - templateFiles: "{{ alertmanager_template_files }}" prometheusOperator: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 114630825..1d0155295 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -27,8 +27,6 @@ slack_integration: alertmanager_config: route: group_by: ['...'] - global: - resolve_timeout: "{{ prometheus_config_flags_extra.alertmanager.timeout | default( '5m' ) }}" receivers: - name: 'null' @@ -39,24 +37,9 @@ prometheus_port: 30000 grafana_port: 30001 alertmanager_port: 30002 -### PREVIOUS ROLE VALUES - prometheus_image_tag: "v2.27.0" -# prometheus_config_dir: /etc/prometheus -prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" -# prometheus_read_only_dirs: [] - -# prometheus_binary_local_dir: '' -# prometheus_skip_install: false - -# prometheus_web_listen_address: "0.0.0.0:9090" -# prometheus_web_external_url: '' -# See https://github.com/prometheus/exporter-toolkit/blob/master/docs/web-configuration.md -# prometheus_web_config: -# tls_server_config: {} -# http_server_config: {} -# basic_auth_users: {} +prometheus_db_dir: "/var/lib/prometheus" prometheus_storage_retention: "30d" # Available since Prometheus 2.7.0 @@ -66,44 +49,13 @@ prometheus_storage_retention_size: "40GB" prometheus_volume_size: 40Gi -prometheus_config_flags_extra: {} -# prometheus_config_flags_extra: -# storage.tsdb.retention: 15d -# alertmanager.timeout: 10s - -prometheus_alertmanager_config: [] -# prometheus_alertmanager_config: -# - scheme: https -# path_prefix: alertmanager/ -# basic_auth: -# username: user -# password: pass -# static_configs: -# - targets: ["127.0.0.1:9093"] -# proxy_url: "127.0.0.2" - prometheus_alert_relabel_configs: [] -# 
prometheus_alert_relabel_configs: -# - action: labeldrop -# regex: replica prometheus_global: scrape_interval: 30s scrape_timeout: 10s evaluation_interval: 15s -prometheus_remote_write: [] -# prometheus_remote_write: -# - url: https://dev.kausal.co/prom/push -# basic_auth: -# password: FOO - -prometheus_remote_read: [] -# prometheus_remote_read: -# - url: https://demo.cloudalchemy.org:9201/read -# basic_auth: -# password: FOO - prometheus_external_labels: environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" # environment: "{{ control_sslip }}" @@ -116,16 +68,6 @@ prometheus_scrape_configs: - targets: - "{{ control_ip }}:{{ slurm_exporter_port }}" -# Alternative config file name, searched in ansible templates path. -# prometheus_config_file: 'prometheus.yml.j2' - -# prometheus_alert_rules_files: -# - prometheus/rules/*.rules - -# prometheus_static_targets_files: -# - prometheus/targets/*.yml -# - prometheus/targets/*.json - prometheus_extra_rules: [] prometheus_rules: @@ -135,208 +77,26 @@ prometheus_rules: rules: "{{ prometheus_extra_rules }}" # ------------------------------------------------------------------------------------------ - -### PREVIOUS GRAFANA VARS grafana_image_tag: 11.2.2 -grafana_instance: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" - grafana_data_dir: "/var/lib/grafana" -# Additional options for grafana "server" section -# This section WILL omit options for: http_addr, http_port, domain, and root_url, as those settings are set by variables listed before -# grafana_server: -# protocol: http -# enforce_domain: false -# socket: "" -# cert_key: "" -# cert_file: "" -# enable_gzip: false -# static_root_path: public -# router_logging: false -# serve_from_sub_path: false - -# # Variables correspond to ones in grafana.ini configuration file -# # Security grafana_security: admin_user: grafana admin_password: "{{ vault_grafana_admin_password }}" -# secret_key: "" # would we want to template 
the rest of this into the ini? its not currently -# login_remember_days: 7 -# cookie_username: grafana_user -# cookie_remember_name: grafana_remember -# disable_gravatar: true -# data_source_proxy_whitelist: - -# User management and registration -# grafana_welcome_email_on_sign_up: false -# grafana_users: -# allow_sign_up: false -# # allow_org_create: true -# # auto_assign_org: true -# auto_assign_org_role: Viewer -# # login_hint: "email or username" -# default_theme: dark -# # external_manage_link_url: "" -# # external_manage_link_name: "" -# # external_manage_info: "" -# grafana authentication mechanisms +# see https://github.com/grafana/helm-charts/blob/main/charts/grafana/values.yaml for configuration options grafana_auth: {} -# disable_login_form: false -# oauth_auto_login: false -# disable_signout_menu: false -# signout_redirect_url: "" -# ldap: -# config_file: "/etc/grafana/ldap.toml" -# allow_sign_up: false -# basic: -# enabled: true - grafana_ldap: {} -# verbose_logging: false -# servers: -# host: 127.0.0.1 -# port: 389 # 636 for SSL -# use_ssl: false -# start_tls: false -# ssl_skip_verify: false -# root_ca_cert: /path/to/certificate.crt -# bind_dn: "cn=admin,dc=grafana,dc=org" -# bind_password: grafana -# search_filter: "(cn=%s)" # "(sAMAccountName=%s)" on AD -# search_base_dns: -# - "dc=grafana,dc=org" -# group_search_filter: "(&(objectClass=posixGroup)(memberUid=%s))" -# group_search_base_dns: -# - "ou=groups,dc=grafana,dc=org" -# attributes: -# name: givenName -# surname: sn -# username: sAMAccountName -# member_of: memberOf -# email: mail -# group_mappings: -# - name: Main Org. 
-# id: 1 -# groups: -# - group_dn: "cn=admins,ou=groups,dc=grafana,dc=org" -# org_role: Admin -# - group_dn: "cn=editors,ou=groups,dc=grafana,dc=org" -# org_role: Editor -# - group_dn: "*" -# org_role: Viewer -# - name: Alternative Org -# id: 2 -# groups: -# - group_dn: "cn=alternative_admins,ou=groups,dc=grafana,dc=org" -# org_role: Admin - -grafana_analytics: {} -# reporting_enabled: true -# google_analytics_ua_id: "" - -# Set this for mail notifications -grafana_smtp: {} -# host: -# user: -# password: -# from_address: - -# Grafana logging configuration -grafana_log: -# mode: 'console file' -# level: info - -# Distributed tracing options -grafana_tracing: {} -# address: "localhost:6831" -# always_included_tag: "tag1:value1,tag2:value2" -# sampler_type: const -# sampler_param: 1 - -grafana_snapshots: {} -# external_enabled: true -# external_snapshot_url: "https://snapshots-origin.raintank.io" -# external_snapshot_name: "Publish to snapshot.raintank.io" -# snapshot_remove_expired: true -# snapshot_TTL_days: 90 - -# # External image store -# grafana_image_storage: {} -# # provider: gcs -# # key_file: -# # bucket: -# # path: - ####### # Plugins from https://grafana.com/plugins grafana_plugins: [] -# - raintank-worldping-app - -# # Dashboards from https://grafana.com/dashboards -# grafana_dashboards: [] -# # - dashboard_id: '4271' -# # revision_id: '3' -# # datasource: 'Prometheus' -# # - dashboard_id: '1860' -# # revision_id: '4' -# # datasource: 'Prometheus' -# # - dashboard_id: '358' -# # revision_id: '1' -# # datasource: 'Prometheus' -# grafana_dashboards_dir: "dashboards" - -# # Alert notification channels to configure -# grafana_alert_notifications: [] -# # - name: "Email Alert" -# # type: "email" -# # uid: channel1 -# # is_default: true -# # settings: -# # addresses: "example@example.com" - -# Datasources to configure +# Additional datasources to configure alongside kube-prometheus-stack defaults grafana_datasources: [] -# - name: "Prometheus" -# type: 
"prometheus" -# access: "proxy" -# url: "http://prometheus.mydomain" -# basicAuth: true -# basicAuthUser: "admin" -# basicAuthPassword: "password" -# isDefault: true -# jsonData: -# tlsAuth: false -# tlsAuthWithCACert: false -# tlsSkipVerify: true - -grafana_environment: {} - -# Panels configurations -grafana_panels: {} -# disable_sanitize_html: false -# enable_alpha: false - -### PREVIOUS ALERTMANAGER ROLEVARS alertmanager_image_tag: v0.27.0 -# alertmanager_config_file: 'alertmanager.yml.j2' - -alertmanager_template_files: {} - -# alertmanager_http_config: {} - -### Previous node exporter vars - node_exporter_image_tag: v1.8.2 -# node_exporter_tls_server_config: {} - -# node_exporter_http_server_config: {} - -# node_exporter_basic_auth_users: {} - From bb928ad55a304763b0e9d17ad08b025ab5980045 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Fri, 18 Oct 2024 12:43:51 +0100 Subject: [PATCH 42/90] review suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- environments/common/inventory/groups | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 903f7f5ab..7be67fc32 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -34,7 +34,7 @@ filebeat mysql [prometheus] -# Single node to host monitoring server. +# Single node to host monitoring stack. [opensearch] # Single node to host ElasticSearch search engine for Slurm monitoring. 
From db91120c2f527b4e2b3b3c5f57e40456ac1f2a8d Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 12:45:41 +0100 Subject: [PATCH 43/90] updated defaults --- .../kube_prometheus_stack/defaults/main/main.yml | 15 ++++----------- .../common/inventory/group_vars/all/grafana.yml | 2 +- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 1d0155295..e08f5d925 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -18,7 +18,7 @@ control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" grafana_volume_size: 10Gi -grafana_auth_anonymous: true +grafana_auth_anonymous: false slack_integration: channel: "#alerts" @@ -57,16 +57,9 @@ prometheus_global: evaluation_interval: 15s prometheus_external_labels: - environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}" - # environment: "{{ control_sslip }}" - -prometheus_scrape_configs: - - job_name: "slurm_exporter" - scrape_interval: 30s - scrape_timeout: 30s - static_configs: - - targets: - - "{{ control_ip }}:{{ slurm_exporter_port }}" + environment: "{{ ansible_fqdn | default(inventory_hostname) | default(ansible_host) }}" + +prometheus_scrape_configs: [] prometheus_extra_rules: [] diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index d8f591c68..5464f6a3e 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -70,7 +70,7 @@ grafana_datasources: grafana_plugins: - grafana-opensearch-datasource 2.8.1 -grafana_auth_anonymous: true # Enable anonymous View-only login - see implications: 
https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards +grafana_auth_anonymous: false # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards grafana_volume_size: 10Gi _grafana_auth_anon_cfg: anonymous: From 7e1370d6108ca5a622581ea52109b2115d855401 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 13:38:13 +0100 Subject: [PATCH 44/90] removed grafana data volume --- .../defaults/main/helm.yml | 4 --- .../defaults/main/main.yml | 4 --- .../kube_prometheus_stack/tasks/main.yml | 36 ------------------- .../inventory/group_vars/all/grafana.yml | 6 ---- 4 files changed, 50 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index e6b8cfe7b..d4046419f 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -114,10 +114,6 @@ kube_prometheus_stack_release_defaults: plugins: "{{ grafana_plugins }}" nodeSelector: clusterrole: "server" - persistence: - type: pvc - enabled: true - existingClaim: grafana-pvc ldap: enabled: "{{ false if grafana_ldap == {} else true }}" config: "{{ grafana_ldap | to_yaml }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index e08f5d925..da4fcc34d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -16,8 +16,6 @@ login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" -grafana_volume_size: 10Gi - grafana_auth_anonymous: 
false slack_integration: @@ -72,8 +70,6 @@ prometheus_rules: # ------------------------------------------------------------------------------------------ grafana_image_tag: 11.2.2 -grafana_data_dir: "/var/lib/grafana" - grafana_security: admin_user: grafana admin_password: "{{ vault_grafana_admin_password }}" diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index a0b301e46..fc3f15fb4 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -53,42 +53,6 @@ path: "{{ prometheus_db_dir }}" type: DirectoryOrCreate -- name: Create Grafana hostPath volume in /var/lib/state - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - definition: - apiVersion: v1 - kind: PersistentVolume - metadata: - name: grafana-dir - labels: - app.kubernetes.io/name: grafana-dir - spec: - capacity: - storage: "{{ grafana_volume_size }}" - accessModes: - - ReadWriteOnce - hostPath: - path: "{{ grafana_data_dir }}" - type: DirectoryOrCreate - -- name: Create PVC for Grafana - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - definition: - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: grafana-pvc - spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: "{{ grafana_volume_size }}" - volumeMode: Filesystem - volumeName: grafana-dir - # not looping through these because templating doesn't set ports as integer - name: Creating headless service for OOD exporter kubernetes.core.k8s: diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 5464f6a3e..44febaefc 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -1,9 +1,3 @@ -grafana_image_tag: '11.2.2' -grafana_port: 30001 # Must be within K3s' reserved port 
range (default 30000-32767) - -# Define where state is stored -grafana_data_dir: "{{ appliances_state_dir | default('/var/lib') }}/grafana" - # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_api_address: "{{ hostvars[groups['prometheus'].0].internal_address }}" grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" From 934ec7a6258614229c3d51b5d4825d012906c946 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 15:19:51 +0100 Subject: [PATCH 45/90] set default dashboard to slurm exporter --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 2 ++ ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index d4046419f..5b188490f 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -110,6 +110,8 @@ kube_prometheus_stack_release_defaults: auth: "{{ grafana_auth }}" auth.anonymous: enabled: "{{ grafana_auth_anonymous }}" + dashboards: + default_home_dashboard_path: "/tmp/dashboards/{{ grafana_home_dashboard }}" additionalDataSources: "{{ grafana_datasources }}" plugins: "{{ grafana_plugins }}" nodeSelector: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index da4fcc34d..febbbba6d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -85,6 +85,8 @@ grafana_plugins: [] # Additional datasources to configure alongside kube-prometheus-stack defaults grafana_datasources: [] +grafana_home_dashboard: 13427.json # node exporter slurm + alertmanager_image_tag: v0.27.0 node_exporter_image_tag: v1.8.2 From a03f7f99dd3b4e62ced71923a09981eb2a1c4ef9 
Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 16:12:56 +0100 Subject: [PATCH 46/90] added play to remove unwanted default dashboards --- .../roles/kube_prometheus_stack/tasks/main.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index fc3f15fb4..27aafd601 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -157,3 +157,18 @@ create_namespace: no wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" + +- name: Delete unwanted default dashboards # currently no way to selectively enabled in helm chart + kubernetes.core.k8s: + state: absent + kind: ConfigMap + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + metadata: + name: "{{ item }}" + loop: + - kube-prometheus-stack-nodes-darwin + - kube-prometheus-stack-grafana-overview + - kube-prometheus-stack-proxy + - kube-prometheus-stack-etcd + - kube-prometheus-stack-alertmanager-overview From 886c22dd2491db7bfdf0aad3b4abff17d31ca906 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 18 Oct 2024 16:22:32 +0100 Subject: [PATCH 47/90] updated grafana groupvars --- environments/common/inventory/group_vars/all/grafana.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 44febaefc..3288c8aa4 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -1,3 +1,5 @@ +grafana_port: 30001 + # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_api_address: "{{ hostvars[groups['prometheus'].0].internal_address }}" grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" @@ -65,7 +67,6 @@ 
grafana_plugins: - grafana-opensearch-datasource 2.8.1 grafana_auth_anonymous: false # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards -grafana_volume_size: 10Gi _grafana_auth_anon_cfg: anonymous: org_name: "Main Org." From c4fa2a61cd5cc99aa3bbde46e5ac9bcd0439e5b3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 21 Oct 2024 16:14:10 +0100 Subject: [PATCH 48/90] added node exporter collection config --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 1 + ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 4 +++- ansible/roles/kube_prometheus_stack/tasks/main.yml | 2 +- .../common/inventory/group_vars/all/node_exporter.yml | 4 ++++ 4 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 environments/common/inventory/group_vars/all/node_exporter.yml diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 5b188490f..7b025ff8d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -155,6 +155,7 @@ kube_prometheus_stack_release_defaults: clusterrole: "server" prometheus-node-exporter: + extraArgs: "{{ node_exporter_args }}" image: tag: "{{ node_exporter_image_tag }}" prometheus: diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index febbbba6d..414e8c754 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -90,4 +90,6 @@ grafana_home_dashboard: 13427.json # node exporter slurm alertmanager_image_tag: v0.27.0 node_exporter_image_tag: v1.8.2 - +node_exporter_args: + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - 
--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 27aafd601..5592ffd81 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -158,7 +158,7 @@ wait: yes wait_timeout: "{{ kube_prometheus_stack_wait_timeout }}" -- name: Delete unwanted default dashboards # currently no way to selectively enabled in helm chart +- name: Delete unwanted default dashboards # currently no way to selectively enable in helm chart kubernetes.core.k8s: state: absent kind: ConfigMap diff --git a/environments/common/inventory/group_vars/all/node_exporter.yml b/environments/common/inventory/group_vars/all/node_exporter.yml new file mode 100644 index 000000000..3c0add5d2 --- /dev/null +++ b/environments/common/inventory/group_vars/all/node_exporter.yml @@ -0,0 +1,4 @@ +# To disable unwanted metrics or enable non-default metrics, add --no-collector or --collector flags to list below (see https://github.com/prometheus/node_exporter?tab=readme-ov-file#collectors) +node_exporter_args: + - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) + - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ From 3e1f019f297ad9c5b2e46dd016e3a9205b02acff Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 21 Oct 2024 16:42:12 +0100 Subject: [PATCH 49/90] removed unenforced volume size config option --- ansible/roles/kube_prometheus_stack/defaults/main/helm.yml | 2 +- 
ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 2 -- ansible/roles/kube_prometheus_stack/tasks/main.yml | 2 +- environments/common/inventory/group_vars/all/prometheus.yml | 3 --- 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index 7b025ff8d..1da23524d 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -71,7 +71,7 @@ kube_prometheus_stack_release_defaults: - ReadWriteOnce resources: requests: - storage: "{{ prometheus_volume_size }}" + storage: 1Gi # not enforced but requires value > 0 retention: "{{ prometheus_storage_retention }}" retentionSize: "{{ prometheus_storage_retention_size }}" additionalAlertRelabelConfigs: "{{ prometheus_alert_relabel_configs }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 414e8c754..1d57a7dbc 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -45,8 +45,6 @@ prometheus_storage_retention: "30d" # supported: KB, MB, GB, TB, PB. 
prometheus_storage_retention_size: "40GB" -prometheus_volume_size: 40Gi - prometheus_alert_relabel_configs: [] prometheus_global: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 5592ffd81..3e27cdf8c 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -46,7 +46,7 @@ app.kubernetes.io/name: prometheus-dir spec: capacity: - storage: "{{ prometheus_volume_size }}" + storage: 1Gi # not enforced for hostpath volumes but requires value >0 to work accessModes: - ReadWriteOnce hostPath: diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 2783dc7bb..4bde55b3d 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -1,11 +1,8 @@ # Must be within K3s' reserved port range (default 30000-32767) prometheus_port: 30000 -prometheus_image_tag: "v2.27.0" - prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" prometheus_storage_retention: "30d" -prometheus_volume_size: 40Gi prometheus_storage_retention_size: "40GB" prometheus_scrape_configs_default: - job_name: "slurm_exporter" From 8d242f7a6fca032dab87a9c0c108afa5fab378c2 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Oct 2024 11:36:35 +0100 Subject: [PATCH 50/90] ondemand grafana proxying now conditional on ondemand having groups defined --- .../defaults/main/helm.yml | 6 +- .../defaults/main/main.yml | 2 - .../kube_prometheus_stack/tasks/main.yml | 59 ++++++++++--------- .../inventory/group_vars/all/defaults.yml | 3 + .../inventory/group_vars/all/grafana.yml | 2 + .../inventory/group_vars/all/openondemand.yml | 2 +- 6 files changed, 39 insertions(+), 35 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml 
index 1da23524d..f4797cd8c 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -104,9 +104,9 @@ kube_prometheus_stack_release_defaults: searchNamespace: ALL grafana.ini: server: - domain: "{{ openondemand_servername }}" - root_url: "{{ grafana_url_openondemand_proxy }}" - serve_from_sub_path: true + domain: "{{ grafana_domain }}" + root_url: "{{ grafana_url }}" + serve_from_sub_path: "{{ grafana_serve_from_sub_path }}" auth: "{{ grafana_auth }}" auth.anonymous: enabled: "{{ grafana_auth_anonymous }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 1d57a7dbc..1d6cccff3 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -12,9 +12,7 @@ kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 5m -login_ip: "{{ hostvars[groups['openondemand'][0]]['ansible_host'] }}" # probably needs to be more robust control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" -control_sslip: "{{ control_ip | regex_replace('\\.', '-') }}.sslip.io" grafana_auth_anonymous: false diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 3e27cdf8c..8952cd140 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -33,7 +33,6 @@ annotations: storageclass.kubernetes.io/is-default-class: "false" -# variables would need refactoring to let us loop through the data directories nicely - name: Create Prometheus hostPath volume in /var/lib/state kubernetes.core.k8s: namespace: "{{ kube_prometheus_stack_release_namespace }}" @@ -53,36 +52,38 @@ path: "{{ prometheus_db_dir }}" type: 
DirectoryOrCreate -# not looping through these because templating doesn't set ports as integer -- name: Creating headless service for OOD exporter - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - definition: - kind: Service - metadata: - name: ood-exporter - spec: - clusterIP: None - ports: - - name: ood-exporter - port: 9301 - protocol: TCP - -- name: Binding OOD exporter service to host - kubernetes.core.k8s: - namespace: "{{ kube_prometheus_stack_release_namespace }}" - definition: - kind: Endpoints - metadata: - name: ood-exporter - subsets: - - addresses: - - ip: "{{ login_ip }}" - ports: - - port: 9301 - name: ood-exporter +- name: Setting up k3s services for OnDemand Exporter + when: groups['openondemand'] | count > 0 + block: + - name: Creating headless service for OOD exporter + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Service + metadata: + name: ood-exporter + spec: + clusterIP: None + ports: + - name: ood-exporter + port: 9301 protocol: TCP + - name: Binding OOD exporter service to host + kubernetes.core.k8s: + namespace: "{{ kube_prometheus_stack_release_namespace }}" + definition: + kind: Endpoints + metadata: + name: ood-exporter + subsets: + - addresses: + - ip: "{{ openondemand_ip }}" + ports: + - port: 9301 + name: ood-exporter + protocol: TCP + - name: Creating headless service for slurm exporter kubernetes.core.k8s: namespace: "{{ kube_prometheus_stack_release_namespace }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 422602754..ceded234b 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -20,6 +20,9 @@ prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}:{{ prome openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if 
groups['openondemand'] | count > 0 else '' }}" grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}" +# Service IP addresses +openondemand_ip: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" + ############################# bootstrap: local user configuration ######################### # Note RockyLinux 8.5 defines system user/groups in range 201-999 diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 3288c8aa4..930bb5988 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -6,6 +6,8 @@ grafana_api_url: "http://{{ grafana_api_address }}:{{ grafana_port }}" grafana_url_direct: "http://{{ grafana_address }}:{{ grafana_port }}" grafana_url_openondemand_proxy: "https://{{ openondemand_servername | default('') }}/node/{{ groups['prometheus'].0 }}/{{ grafana_port }}" grafana_url: "{{ grafana_url_openondemand_proxy if groups['openondemand'] | count > 0 else grafana_url_direct }}" +grafana_serve_from_sub_path: "{{ groups['openondemand'] | count > 0 }}" +grafana_domain: "{{ ( openondemand_servername | default('') ) if groups['openondemand'] | count > 0 else grafana_api_address }}" grafana_dashboards_default: # node exporter slurm: diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 621735000..71ffff844 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -178,7 +178,7 @@ openondemand_scrape_configs: scrape_interval: 2m static_configs: - targets: - - "{{ login_ip }}:9301" + - "{{ openondemand_address }}:9301" labels: environment: "{{ appliances_environment_name }}" service: "openondemand" From e6fbda805853e97503169d485ae6229363f69343 Mon Sep 17 00:00:00 2001 From: 
wtripp180901 Date: Tue, 22 Oct 2024 11:52:25 +0100 Subject: [PATCH 51/90] standardised control ip resolution --- ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 1d6cccff3..9d607c2d7 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -12,7 +12,7 @@ kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 5m -control_ip: "{{ ansible_default_ipv4.address| default(ansible_all_ipv4_addresses[0]) }}" +control_ip: "{{ hostvars[groups['control'].0].ansible_host }}" grafana_auth_anonymous: false From e6a4e4bd9df56c4273b9d68f71230b4c73fea140 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Oct 2024 13:05:41 +0100 Subject: [PATCH 52/90] reduced collectors to minimal set --- ansible/roles/kube_prometheus_stack/tasks/main.yml | 1 + .../common/inventory/group_vars/all/node_exporter.yml | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 8952cd140..5ea7c9453 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -173,3 +173,4 @@ - kube-prometheus-stack-proxy - kube-prometheus-stack-etcd - kube-prometheus-stack-alertmanager-overview + - kube-prometheus-stack-scheduler diff --git a/environments/common/inventory/group_vars/all/node_exporter.yml b/environments/common/inventory/group_vars/all/node_exporter.yml index 3c0add5d2..f9ffc0312 100644 --- a/environments/common/inventory/group_vars/all/node_exporter.yml +++ b/environments/common/inventory/group_vars/all/node_exporter.yml @@ -1,4 +1,12 @@ -# 
To disable unwanted metrics or enable non-default metrics, add --no-collector or --collector flags to list below (see https://github.com/prometheus/node_exporter?tab=readme-ov-file#collectors) +# To enable additional metrics, add --collector flags to list below (see https://github.com/prometheus/node_exporter?tab=readme-ov-file#collectors) node_exporter_args: - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/) - --collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$ + - --collector.disable-defaults + - --collector.netdev + - --collector.cpu + - --collector.meminfo + - --collector.infiniband + - --collector.cpufreq + - --collector.diskstats + - --collector.filesystem From e5dff96e648405f1f514cc64beba437e758adb2b Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 22 Oct 2024 13:48:15 +0100 Subject: [PATCH 53/90] updated docs --- docs/monitoring-and-logging.README.md | 58 ++++++++++++--------------- 1 file changed, 25 insertions(+), 33 deletions(-) diff --git a/docs/monitoring-and-logging.README.md b/docs/monitoring-and-logging.README.md index 3e3de38c0..b7060e8db 100644 --- a/docs/monitoring-and-logging.README.md +++ b/docs/monitoring-and-logging.README.md @@ -2,6 +2,9 @@ ## Components overview +### [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) +An umbrella Helm chart which the appliance uses to deploy and manage containerised versions of Grafana and Prometheus. + ### [filebeat](https://www.elastic.co/beats/filebeat) Parses log files and ships them to elasticsearch. Note we use the version shipped by Open Distro. @@ -85,18 +88,15 @@ This section details the configuration of grafana. 
### Defaults -Internally, we use the [cloudalchemy.grafana](https://github.com/cloudalchemy/ansible-grafana) role. You can customise any of the variables that the role supports. For a full list, please see the -[upstream documentation](https://github.com/cloudalchemy/ansible-grafana). The appliance defaults can be found here: - -> [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml) +Internally, we configure Grafana using the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) Helm chart which passes values to a [Grafana subchart.](https://github.com/grafana/helm-charts/tree/main/charts/grafana) Common configuration options for the chart are exposed in [ansible/roles/kube_prometheus_stack/defaults/main/main.yml](../ansible/roles/kube_prometheus_stack/defaults/main/main.yml), with sensible defaults for Grafana being set in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). For more control over configuration, the exact values that will be merged with the Helm chart defaults can be found in [ansible/roles/kube_prometheus_stack/defaults/main/helm.yml](../ansible/roles/kube_prometheus_stack/defaults/main/helm.yml). ### Placement -The `grafana` group controls the placement of the grafana service. Load balancing is currently unsupported so it is important that you only assign one host to this group. +The `prometheus` group controls the placement of the Kubernetes monitoring stack. Load balancing is currently unsupported so it is important that you only assign one host to this group. ### Access -If Open Ondemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). 
The port used (variable `grafana_port`) defaults to `3000`. +If Open Ondemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first host in the Prometheus group (note that currently there is no support for load balancing so only one host should be in this group, the control node is used by default). See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `30001`. The default credentials for the admin user are: @@ -105,11 +105,13 @@ The default credentials for the admin user are: Where `vault_grafana_admin_password` is a variable containing the actual password. This is generated by the `generate-passwords.yml` adhoc playbook (see [README.md](../README.md#creating-a-slurm-appliance)). +Note that if Open OnDemand is enabled, Grafana is only accessible through OOD's proxy. Requests to `grafana_url_direct` will be redirected through the proxy, which will ask you to authenticate against Open OnDemand (NOT Grafana credentials). See [Open OnDemand docs.](openondemand.README.md) + ### grafana dashboards -The appliance ships with a default set of dashboards. The set of dashboards can be configured via the `grafana_dashboards` variable. The dashboards are either internal to the [grafana-dashboards role](../ansible/roles/grafana-dashboards/files/) or downloaded from grafana.com. +In addition to the default set of dashboards that are deployed by kube-prometheus-stack, the appliance ships with a default set of dashboards (listed below). The set of appliance-specific dashboards can be configured via the `grafana_dashboards` variable. The dashboards are either internal to the [grafana-dashboards role](../ansible/roles/grafana-dashboards/files/) or downloaded from grafana.com. -#### node exporter +#### node exporter slurm This shows detailed metrics about an individual host. 
The metric source is `node exporter` (See [prometheus section](#prometheus-1) for more details). A slurm job annotation can optionally be enabled which will highlight the period of time where a given slurm job was running. The slurm job that is highlighted is controlled by the `Slurm Job ID` variable. An example is shown below: @@ -210,42 +212,32 @@ This section details the configuration of prometheus. ### Defaults -Internally, we use the [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role. You can customise any of the variables that the role supports. For a full list, please see the -[upstream documentation](https://github.com/cloudalchemy/ansible-prometheus). The appliance defaults can be found here: - -> [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) +Like Grafana, Prometheus is internally configured using kube-prometheus-stack with rolevars in [ansible/roles/kube_prometheus_stack/defaults/main](../ansible/roles/kube_prometheus_stack/defaults/main) (see [Grafana defaults section](#grafana-1) for more detail). Sensible defaults are defined in [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). ### Placement -The `prometheus` group determines the placement of the prometheus service. Load balancing is currently unsupported so it is important that you only assign one host to this group. +The `prometheus` group controls the placement of the Kubernetes monitoring stack. Load balancing is currently unsupported so it is important that you only assign one host to this group. ### Access -Prometheus is exposed on port `9090` on all hosts in the prometheus group. Currently, the configuration assumes a single host. 
Following the reference layout in `environments/common/layouts/everything`, this will be set to the slurm `control` node, prometheus would then be accessible from: +Prometheus is exposed on port `30000` on all hosts in the prometheus group. Currently, the configuration assumes a single host. Following the reference layout in `environments/common/layouts/everything`, this will be set to the slurm `control` node, prometheus would then be accessible from: - > http://:9090 + > http://:30000 -The port can customised by overriding the `prometheus_web_external_url` variable. +The port can be customised by overriding the `prometheus_port` variable. Note that this service is not password protected, allowing anyone with access to the URL to make queries. -### Recording rules +### Alerting and recording rules -The upstream documentation can be found [here](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/). +See the upstream documentation for [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules. -This appliance provides a default set of recording rules which can be found here: - -> [environments/common/files/prometheus/rules/precompute.rules](../environments/common/files/prometheus/rules/precompute.rules) - -The intended purpose is to pre-compute some expensive queries that are used -in the reference set of grafana dashboards. - -To add new, or to remove rules you will be to adjust the `prometheus_alert_rules_files` variable. 
The default value can be found in: +In addition to the default recording and alerting rules set by kube-prometheus-stack, the appliance provides a default set of rules which can be found in the `prometheus_extra_rules` list in: > [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) -You can extend this variable in your environment specific configuration to reference extra files or to remove the defaults. The reference set of dashboards expect these variables to be defined, so if you remove them, you -will also have to update your dashboards. +The provided default recording rules are intended to pre-compute some expensive queries that are used +in the reference set of grafana dashboards. The default alerting rules define alerts for issues with Slurm nodes. ### node_exporter @@ -262,18 +254,18 @@ This appliance customises the default set of collectors to a minimal set, these - meminfo - infiniband - cpufreq +- diskstats +- filesystem -The list can be customised by overriding the `collect[]` parameter of the `node` job in the `prometheus_scrape_configs` dictionary. The defaults can be found in: - -> [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). +The list can be customised by adding or removing `--collector` flags to Node Exporter's command line arguments. The defaults can be found in: -Variables in this file should *not* be customised directly, but should be overridden in your `environment`. See [README.md](../README.md#environments) which details the process of overriding default variables in more detail. +> [environments/common/inventory/group_vars/all/node_exporter.yml](../environments/common/inventory/group_vars/all/node_exporter.yml). ### custom ansible filters #### prometheus_node_exporter_targets -Groups prometheus targets into per environment groups. 
The ansible variable, `env` is used to determine the grouping. The metrics for each target in the group are given the prometheus label, `env: $env`, where `$env` is the value of the `env` variable for that host. +Groups prometheus targets into per environment groups. The ansible variable, `cluster_env` is used to determine the grouping. The metrics for each target in the group are given the prometheus label, `cluster_env: $cluster_env`, where `$cluster_env` is the value of the `cluster_env` variable for that host. ## slurm-stats From 4c44261a3a50a791ed09efce1ca4588d60e88ef6 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Wed, 23 Oct 2024 09:53:20 +0100 Subject: [PATCH 54/90] bump images --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index fc7efcf07..7e4f69621 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241015-1524-fdb5c237", - "RL9": "openhpc-RL9-241015-1524-fdb5c237", - "RL9-cuda": "openhpc-cuda-RL9-241015-1524-fdb5c237" + "RL8": "openhpc-RL8-241023-0809-ce90ab0b", + "RL9": "openhpc-RL9-241022-1628-ce90ab0b", + "RL9-cuda": "openhpc-cuda-RL9-241023-0809-ce90ab0b" } } From f93348ed3060d487b89f0162a0394566c7c5c23c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 23 Oct 2024 17:47:00 +0100 Subject: [PATCH 55/90] monitoring stack images now pre-pulled --- ansible/fatimage.yml | 33 +++++++++++++++++++ .../defaults/main/helm.yml | 13 ++++++++ .../defaults/main/main.yml | 5 +++ .../inventory/group_vars/all/alertmanager.yml | 2 ++ .../inventory/group_vars/all/grafana.yml | 2 ++ .../inventory/group_vars/all/monitoring.yml | 10 ++++++ 
.../group_vars/all/node_exporter.yml | 2 ++ .../inventory/group_vars/all/prometheus.yml | 2 ++ 8 files changed, 69 insertions(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index d6c250c5d..fe6abb5db 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -114,6 +114,39 @@ slurm_exporter_state: stopped when: "'slurm_exporter' in group_names" +- hosts: builder + name: Pre-pull kube-prometheus-stack images and import to k3s + vars: + image_list: + - { name: "docker.io/grafana/grafana", tag: "{{ grafana_image_tag }}" } + - { name: "quay.io/prometheus/prometheus", tag: "{{ prometheus_image_tag }}" } + - { name: "quay.io/prometheus/alertmanager", tag: "{{ alertmanager_image_tag }}" } + - { name: "quay.io/prometheus-operator/node-exporter", tag: "{{ node_exporter_image_tag }}" } + - { name: "quay.io/prometheus-operator/prometheus-config-reloader", tag: "{{ kube_prometheus_stack_app_version }}" } + - { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } + - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } + - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" } + - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } + tasks: + - name: Pull with images with podman + containers.podman.podman_image: + name: "{{ item.name }}" + tag: "{{ item.tag }}" + loop: "{{ image_list }}" + + - name: Export images to k3s + containers.podman.podman_save: + image: "{{ item.name }}:{{ item.tag }}" + dest: "/var/lib/rancher/k3s/agent/images/{{ item.name | regex_replace('\\/|\\.','-')}}.tar" + loop: "{{ image_list }}" + + - name: Clean up podman images + containers.podman.podman_image: + state: absent + name: "{{ item.name }}" + tag: "{{ item.tag }}" + loop: "{{ image_list }}" + - name: Run post.yml hook vars: appliances_environment_root: "{{ lookup('env', 
'APPLIANCES_ENVIRONMENT_ROOT') }}" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index f4797cd8c..a4f43c815 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -100,6 +100,8 @@ kube_prometheus_stack_release_defaults: ingress: path: "/node/{{ groups['prometheus'].0 }}/{{ grafana_port }}" sidecar: + image: + tag: "{{ grafana_sidecar_image_tag }}" dashboards: searchNamespace: ALL grafana.ini: @@ -147,10 +149,21 @@ kube_prometheus_stack_release_defaults: prometheusOperator: + image: + tag: "{{ kube_prometheus_stack_app_version }}" + prometheusConfigReloader: + image: + tag: "{{ kube_prometheus_stack_app_version }}" + admissionWebhooks: + patch: + image: + tag: "{{ kube_prometheus_stack_patch_image_tag }}" nodeSelector: clusterrole: "server" kube-state-metrics: + image: + tag: "{{ kube_prometheus_stack_metrics_image_tag }}" nodeSelector: clusterrole: "server" diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 9d607c2d7..f47550da6 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -4,6 +4,7 @@ kube_prometheus_stack_chart_repo: https://prometheus-community.github.io/helm-charts kube_prometheus_stack_chart_name: kube-prometheus-stack kube_prometheus_stack_chart_version: 59.1.0 +kube_prometheus_stack_app_version: v0.74.0 # Release information kube_prometheus_stack_release_namespace: monitoring-system @@ -12,6 +13,9 @@ kube_prometheus_stack_release_name: kube-prometheus-stack # The timeout to wait for the release to become ready kube_prometheus_stack_wait_timeout: 5m +kube_prometheus_stack_metrics_image_tag: v2.12.0 +kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6 + control_ip: "{{ 
hostvars[groups['control'].0].ansible_host }}" grafana_auth_anonymous: false @@ -84,6 +88,7 @@ grafana_datasources: [] grafana_home_dashboard: 13427.json # node exporter slurm alertmanager_image_tag: v0.27.0 +grafana_sidecar_image_tag: 1.26.1 node_exporter_image_tag: v1.8.2 node_exporter_args: diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index a78ffee7e..6d95ae233 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -1,3 +1,5 @@ +alertmanager_image_tag: v0.27.0 + alertmanager_config: route: group_by: ['...'] diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 930bb5988..d028b85c9 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -1,4 +1,6 @@ grafana_port: 30001 +grafana_image_tag: 11.2.2 +grafana_sidecar_image_tag: 1.26.1 # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_api_address: "{{ hostvars[groups['prometheus'].0].internal_address }}" diff --git a/environments/common/inventory/group_vars/all/monitoring.yml b/environments/common/inventory/group_vars/all/monitoring.yml index 098039b44..c80c145f9 100644 --- a/environments/common/inventory/group_vars/all/monitoring.yml +++ b/environments/common/inventory/group_vars/all/monitoring.yml @@ -4,3 +4,13 @@ kube_prometheus_stack_release_name: kube-prometheus-stack kube_prometheus_stack_wait_timeout: 5m # See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services + + +# used as tag prometheus-operator image and helpers, +# should be kept in sync with the appVersion provided in the Chart.yaml of whichever +# kube_prometheus_stack_chart_version is used 
+kube_prometheus_stack_app_version: v0.74.0 + +# helper images +kube_prometheus_stack_metrics_image_tag: v2.12.0 +kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6 diff --git a/environments/common/inventory/group_vars/all/node_exporter.yml b/environments/common/inventory/group_vars/all/node_exporter.yml index f9ffc0312..311527048 100644 --- a/environments/common/inventory/group_vars/all/node_exporter.yml +++ b/environments/common/inventory/group_vars/all/node_exporter.yml @@ -10,3 +10,5 @@ node_exporter_args: - --collector.cpufreq - --collector.diskstats - --collector.filesystem + +node_exporter_image_tag: v1.8.2 diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 4bde55b3d..2b8fc3bac 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -1,6 +1,8 @@ # Must be within K3s' reserved port range (default 30000-32767) prometheus_port: 30000 +prometheus_image_tag: "v2.27.0" + prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" prometheus_storage_retention: "30d" prometheus_storage_retention_size: "40GB" From e8d2e81a60c8963ca3f63f5f5840b29909585de3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 23 Oct 2024 18:04:14 +0100 Subject: [PATCH 56/90] moved monitoring pre-pulls to role --- ansible/fatimage.yml | 39 +++---------------- .../kube_prometheus_stack/tasks/install.yml | 33 ++++++++++++++++ 2 files changed, 39 insertions(+), 33 deletions(-) create mode 100644 ansible/roles/kube_prometheus_stack/tasks/install.yml diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index fe6abb5db..bbce9687c 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -113,39 +113,12 @@ vars: slurm_exporter_state: stopped when: "'slurm_exporter' in group_names" - -- hosts: builder - name: Pre-pull kube-prometheus-stack images and import to k3s - vars: - 
image_list: - - { name: "docker.io/grafana/grafana", tag: "{{ grafana_image_tag }}" } - - { name: "quay.io/prometheus/prometheus", tag: "{{ prometheus_image_tag }}" } - - { name: "quay.io/prometheus/alertmanager", tag: "{{ alertmanager_image_tag }}" } - - { name: "quay.io/prometheus-operator/node-exporter", tag: "{{ node_exporter_image_tag }}" } - - { name: "quay.io/prometheus-operator/prometheus-config-reloader", tag: "{{ kube_prometheus_stack_app_version }}" } - - { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } - - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } - - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" } - - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } - tasks: - - name: Pull with images with podman - containers.podman.podman_image: - name: "{{ item.name }}" - tag: "{{ item.tag }}" - loop: "{{ image_list }}" - - - name: Export images to k3s - containers.podman.podman_save: - image: "{{ item.name }}:{{ item.tag }}" - dest: "/var/lib/rancher/k3s/agent/images/{{ item.name | regex_replace('\\/|\\.','-')}}.tar" - loop: "{{ image_list }}" - - - name: Clean up podman images - containers.podman.podman_image: - state: absent - name: "{{ item.name }}" - tag: "{{ item.tag }}" - loop: "{{ image_list }}" + + - name: kube prometheus stack + import_role: + name: kube_prometheus_stack + tasks_from: install.yml + when: "'prometheus' in group_names" - name: Run post.yml hook vars: diff --git a/ansible/roles/kube_prometheus_stack/tasks/install.yml b/ansible/roles/kube_prometheus_stack/tasks/install.yml new file mode 100644 index 000000000..d5d494c41 --- /dev/null +++ b/ansible/roles/kube_prometheus_stack/tasks/install.yml @@ -0,0 +1,33 @@ +--- + +- name: Pre-pull kube-prometheus-stack images and import to k3s + vars: + image_list: + - { name: 
"docker.io/grafana/grafana", tag: "{{ grafana_image_tag }}" } + - { name: "quay.io/prometheus/prometheus", tag: "{{ prometheus_image_tag }}" } + - { name: "quay.io/prometheus/alertmanager", tag: "{{ alertmanager_image_tag }}" } + - { name: "quay.io/prometheus-operator/node-exporter", tag: "{{ node_exporter_image_tag }}" } + - { name: "quay.io/prometheus-operator/prometheus-config-reloader", tag: "{{ kube_prometheus_stack_app_version }}" } + - { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } + - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } + - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" } + - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } + block: + - name: Pull with images with podman + containers.podman.podman_image: + name: "{{ item.name }}" + tag: "{{ item.tag }}" + loop: "{{ image_list }}" + + - name: Export images to k3s + containers.podman.podman_save: + image: "{{ item.name }}:{{ item.tag }}" + dest: "/var/lib/rancher/k3s/agent/images/{{ item.name | regex_replace('\\/|\\.','-')}}.tar" + loop: "{{ image_list }}" + + - name: Clean up podman images + containers.podman.podman_image: + state: absent + name: "{{ item.name }}" + tag: "{{ item.tag }}" + loop: "{{ image_list }}" \ No newline at end of file From d00245400e44d344a0edf6be187b8f7578522f53 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 24 Oct 2024 08:41:41 +0100 Subject: [PATCH 57/90] fixed build typo --- ansible/roles/kube_prometheus_stack/tasks/install.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/tasks/install.yml b/ansible/roles/kube_prometheus_stack/tasks/install.yml index d5d494c41..d3ff3582a 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/install.yml +++ 
b/ansible/roles/kube_prometheus_stack/tasks/install.yml @@ -6,7 +6,7 @@ - { name: "docker.io/grafana/grafana", tag: "{{ grafana_image_tag }}" } - { name: "quay.io/prometheus/prometheus", tag: "{{ prometheus_image_tag }}" } - { name: "quay.io/prometheus/alertmanager", tag: "{{ alertmanager_image_tag }}" } - - { name: "quay.io/prometheus-operator/node-exporter", tag: "{{ node_exporter_image_tag }}" } + - { name: "quay.io/prometheus/node-exporter", tag: "{{ node_exporter_image_tag }}" } - { name: "quay.io/prometheus-operator/prometheus-config-reloader", tag: "{{ kube_prometheus_stack_app_version }}" } - { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } From 991613e2a9791a0410dbafe61edddc48194d3efd Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 24 Oct 2024 08:48:37 +0100 Subject: [PATCH 58/90] removed unused groupvars --- .../roles/kube_prometheus_stack/defaults/main/main.yml | 4 ++++ .../common/inventory/group_vars/all/grafana.yml | 2 -- .../common/inventory/group_vars/all/monitoring.yml | 10 ---------- .../common/inventory/group_vars/all/node_exporter.yml | 2 -- .../common/inventory/group_vars/all/prometheus.yml | 2 -- 5 files changed, 4 insertions(+), 16 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index f47550da6..b272c36e1 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -4,6 +4,10 @@ kube_prometheus_stack_chart_repo: https://prometheus-community.github.io/helm-charts kube_prometheus_stack_chart_name: kube-prometheus-stack kube_prometheus_stack_chart_version: 59.1.0 + +# used as tag prometheus-operator image and helpers, +# should be kept in sync with the appVersion provided in the Chart.yaml of whichever +# 
kube_prometheus_stack_chart_version is used kube_prometheus_stack_app_version: v0.74.0 # Release information diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index d028b85c9..930bb5988 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -1,6 +1,4 @@ grafana_port: 30001 -grafana_image_tag: 11.2.2 -grafana_sidecar_image_tag: 1.26.1 # Configure external address, with external URL depending on whether we are using Open Ondemand as a proxy grafana_api_address: "{{ hostvars[groups['prometheus'].0].internal_address }}" diff --git a/environments/common/inventory/group_vars/all/monitoring.yml b/environments/common/inventory/group_vars/all/monitoring.yml index c80c145f9..098039b44 100644 --- a/environments/common/inventory/group_vars/all/monitoring.yml +++ b/environments/common/inventory/group_vars/all/monitoring.yml @@ -4,13 +4,3 @@ kube_prometheus_stack_release_name: kube-prometheus-stack kube_prometheus_stack_wait_timeout: 5m # See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services - - -# used as tag prometheus-operator image and helpers, -# should be kept in sync with the appVersion provided in the Chart.yaml of whichever -# kube_prometheus_stack_chart_version is used -kube_prometheus_stack_app_version: v0.74.0 - -# helper images -kube_prometheus_stack_metrics_image_tag: v2.12.0 -kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6 diff --git a/environments/common/inventory/group_vars/all/node_exporter.yml b/environments/common/inventory/group_vars/all/node_exporter.yml index 311527048..f9ffc0312 100644 --- a/environments/common/inventory/group_vars/all/node_exporter.yml +++ b/environments/common/inventory/group_vars/all/node_exporter.yml @@ -10,5 +10,3 @@ node_exporter_args: - --collector.cpufreq - --collector.diskstats - 
--collector.filesystem - -node_exporter_image_tag: v1.8.2 diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 2b8fc3bac..4bde55b3d 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -1,8 +1,6 @@ # Must be within K3s' reserved port range (default 30000-32767) prometheus_port: 30000 -prometheus_image_tag: "v2.27.0" - prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" prometheus_storage_retention: "30d" prometheus_storage_retention_size: "40GB" From b4b69b55b1b26f08c3cc0cba840d20d4ecaf5ccf Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 24 Oct 2024 08:55:39 +0100 Subject: [PATCH 59/90] removed cloudalchemy roles from install --- requirements.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/requirements.yml b/requirements.yml index e4b6bd6ef..2c0230b06 100644 --- a/requirements.yml +++ b/requirements.yml @@ -5,17 +5,6 @@ roles: - src: https://github.com/stackhpc/ansible-role-openhpc.git version: v0.26.0 # https://github.com/stackhpc/ansible-role-openhpc/pull/168 name: stackhpc.openhpc - - src: https://github.com/stackhpc/ansible-node-exporter.git - version: stackhpc - name: cloudalchemy.node_exporter - - src: https://github.com/cloudalchemy/ansible-prometheus.git - version: 4d2c8d742de39e50387e0aa6d5510b21c7451343 # need fix in preceeding commit for rocky - name: cloudalchemy.prometheus - - src: cloudalchemy.alertmanager - version: 0.19.1 - - src: https://github.com/stackhpc/ansible-grafana.git - name: cloudalchemy.grafana - version: stackhpc-0.19.0 # fix grafana install - src: https://github.com/OSC/ood-ansible.git name: osc.ood version: v3.1.5 From b0f48fd0f7a386b7d031ae5439e1eb60a3bb4e90 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 24 Oct 2024 10:32:06 +0100 Subject: [PATCH 60/90] bump images --- 
.../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 7e4f69621..45ef20938 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241023-0809-ce90ab0b", - "RL9": "openhpc-RL9-241022-1628-ce90ab0b", - "RL9-cuda": "openhpc-cuda-RL9-241023-0809-ce90ab0b" + "RL8": "openhpc-RL8-241024-0744-d0024540", + "RL9": "openhpc-RL9-241024-0744-d0024540", + "RL9-cuda": "openhpc-cuda-RL9-241024-0744-d0024540" } } From ec57a21bc6023347f6c638b1802ae31eb05f7880 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 24 Oct 2024 13:34:44 +0100 Subject: [PATCH 61/90] fixed some incompatibilities with old metrics --- .../roles/grafana-dashboards/files/openhpc-slurm.json | 10 +++++----- .../roles/kube_prometheus_stack/defaults/main/helm.yml | 5 +++-- docs/monitoring-and-logging.README.md | 3 ++- .../common/inventory/group_vars/all/node_exporter.yml | 1 + 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json index 820fc4973..5458ba9a4 100644 --- a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json +++ b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json @@ -1950,15 +1950,15 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, cluster_env)", + "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, env)", "error": null, "hide": 0, "includeAll": true, "label": "Environment", "multi": true, - "name": "cluster_env", + "name": "env", "options": [], - "query": 
"label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, cluster_env)", + "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, env)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -1994,7 +1994,7 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", cluster_env=~\"$cluster_env\", instance=~\"$host_filter\"}, instance)", + "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", "error": null, "hide": 0, "includeAll": true, @@ -2002,7 +2002,7 @@ "multi": true, "name": "instance", "options": [], - "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", cluster_env=~\"$cluster_env\", instance=~\"$host_filter\"}, instance)", + "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml index a4f43c815..e9c04632f 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/helm.yml @@ -181,8 +181,9 @@ kube_prometheus_stack_release_defaults: targetLabel: instance replacement: $1 action: replace - metricRelabelings: - - targetLabel: cluster_env + - targetLabel: env + sourceLabels: [env] + regex: ^$ replacement: ungrouped kube_prometheus_stack_release_overrides: {} diff --git a/docs/monitoring-and-logging.README.md b/docs/monitoring-and-logging.README.md index b7060e8db..961634419 100644 --- a/docs/monitoring-and-logging.README.md +++ b/docs/monitoring-and-logging.README.md @@ -256,6 +256,7 @@ This appliance customises the default set of collectors to a minimal set, these - cpufreq - diskstats - filesystem +- uname The list can be customised by adding or 
removing `--collector` flags to Node Exporter's command line arguments. The defaults can be found in: @@ -265,7 +266,7 @@ The list can be customised by adding or removing `--collector` flags to Node Exp #### prometheus_node_exporter_targets -Groups prometheus targets into per environment groups. The ansible variable, `cluster_env` is used to determine the grouping. The metrics for each target in the group are given the prometheus label, `cluster_env: $cluster_env`, where `$cluster_env` is the value of the `cluster_env` variable for that host. +Groups prometheus targets into per environment groups. The ansible variable, `env` is used to determine the grouping. The metrics for each target in the group are given the prometheus label, `env: $env`, where `$env` is the value of the `env` variable for that host. ## slurm-stats diff --git a/environments/common/inventory/group_vars/all/node_exporter.yml b/environments/common/inventory/group_vars/all/node_exporter.yml index f9ffc0312..3d1e122c9 100644 --- a/environments/common/inventory/group_vars/all/node_exporter.yml +++ b/environments/common/inventory/group_vars/all/node_exporter.yml @@ -10,3 +10,4 @@ node_exporter_args: - --collector.cpufreq - --collector.diskstats - --collector.filesystem + - --collector.uname From a0edab7ccf890e46576d27d233426e5abe045c24 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Thu, 24 Oct 2024 15:22:16 +0100 Subject: [PATCH 62/90] removed container internal networking devices from grafana --- ansible/roles/grafana-dashboards/files/openhpc-slurm.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json index 5458ba9a4..164927b38 100644 --- a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json +++ b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json @@ -1728,7 +1728,7 @@ "steppedLine": false, "targets": [ { - "expr": 
"irate(node_network_receive_bytes_total{job=~\"$job\",instance=~\"$instance\"}[60s])*8", + "expr": "irate(node_network_receive_bytes_total{job=~\"$job\",instance=~\"$instance\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[60s])*8", "hide": true, "interval": "", "intervalFactor": 1, @@ -1739,7 +1739,7 @@ "target": "" }, { - "expr": "irate(node_network_transmit_bytes_total{job=~\"$job\",instance=~\"$instance\"}[60s])*8", + "expr": "irate(node_network_transmit_bytes_total{job=~\"$job\",instance=~\"$instance\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[60s])*8", "hide": false, "interval": "", "intervalFactor": 1, @@ -1853,7 +1853,7 @@ "steppedLine": false, "targets": [ { - "expr": "irate(node_network_receive_bytes_total{job=~\"$job\",instance=~\"$instance\"}[60s])*8", + "expr": "irate(node_network_receive_bytes_total{job=~\"$job\",instance=~\"$instance\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[60s])*8", "hide": false, "interval": "", "intervalFactor": 1, @@ -1864,7 +1864,7 @@ "target": "" }, { - "expr": "irate(node_network_transmit_bytes_total{job=~\"$job\",instance=~\"$instance\"}[60s])*8", + "expr": "irate(node_network_transmit_bytes_total{job=~\"$job\",instance=~\"$instance\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[60s])*8", "hide": true, "interval": "", "intervalFactor": 1, From 8acc2b53866317484f1be0a999d7a1702790f0f7 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 25 Oct 2024 10:56:29 +0100 Subject: [PATCH 63/90] openhpc dashboard now job agnostic --- .../files/openhpc-slurm.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json 
b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json index 164927b38..19d4a83c5 100644 --- a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json +++ b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json @@ -711,7 +711,7 @@ "steppedLine": false, "targets": [ { - "expr": "100 - (100 * node_memory_MemAvailable_bytes{job=~\"node-exporter\", instance=~\"$instance\"} / node_memory_MemTotal_bytes{job=~\"node-exporter\", instance=~\"$instance\"})", + "expr": "100 - (100 * node_memory_MemAvailable_bytes{job=~\"$job\", instance=~\"$instance\"} / node_memory_MemTotal_bytes{job=~\"$job\", instance=~\"$instance\"})", "format": "time_series", "interval": "", "intervalFactor": 1, @@ -818,7 +818,7 @@ "steppedLine": false, "targets": [ { - "expr": "(100 * sum by(instance)(increase(node_cpu_seconds_total{mode=\"iowait\",job=~\"node-exporter_fast\"}[1s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job=~\"node-exporter_fast\"}[1s])))", + "expr": "(100 * sum by(instance)(increase(node_cpu_seconds_total{mode=\"iowait\",job=~\"$job\"}[1s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job=~\"$job\"}[1s])))", "hide": true, "interval": "", "legendFormat": "{{ instance }}", @@ -1489,7 +1489,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(node_infiniband_port_transmit_wait_total{job=\"node-exporter\",instance=~\"$instance\"}[1m]) / increase(node_infiniband_port_packets_transmitted_total{job=\"node-exporter\",instance=~\"$instance\"}[1m])", + "expr": "increase(node_infiniband_port_transmit_wait_total{job=\"$job\",instance=~\"$instance\"}[1m]) / increase(node_infiniband_port_packets_transmitted_total{job=\"$job\",instance=~\"$instance\"}[1m])", "hide": false, "interval": "", "legendFormat": "{{ instance }} {{ device }} {{ port }}", @@ -1950,7 +1950,7 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, env)", + "definition": 
"label_values(node_cpu_seconds_total{job=~\"$job\"}, env)", "error": null, "hide": 0, "includeAll": true, @@ -1958,7 +1958,7 @@ "multi": true, "name": "env", "options": [], - "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, env)", + "query": "label_values(node_cpu_seconds_total{job=~\"$job\"}, env)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -1994,7 +1994,7 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", + "definition": "label_values(node_cpu_seconds_total{job=~\"$job\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", "error": null, "hide": 0, "includeAll": true, @@ -2002,7 +2002,7 @@ "multi": true, "name": "instance", "options": [], - "query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", + "query": "label_values(node_cpu_seconds_total{job=~\"$job\", env=~\"$env\", instance=~\"$host_filter\"}, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, @@ -2017,7 +2017,7 @@ "allValue": null, "current": {}, "datasource": "${DS_PROMETHEUS}", - "definition": "label_values(node_infiniband_info{job=~\"node-exporter\"}, device)", + "definition": "label_values(node_infiniband_info{job=~\"$job\"}, device)", "error": null, "hide": 0, "includeAll": true, @@ -2025,7 +2025,7 @@ "multi": true, "name": "device", "options": [], - "query": "label_values(node_infiniband_info{job=~\"node-exporter\"}, device)", + "query": "label_values(node_infiniband_info{job=~\"$job\"}, device)", "refresh": 1, "regex": "", "skipUrlSync": false, From ee945b5c4e7abec913ea769139333449f454a386 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 25 Oct 2024 11:14:42 +0100 Subject: [PATCH 64/90] added local copy of slurm exporter dashboard without container network devices --- .../files/node-exporter-slurm.json | 13926 ++++++++++++++++ 
.../defaults/main/main.yml | 2 +- .../inventory/group_vars/all/grafana.yml | 2 +- 3 files changed, 13928 insertions(+), 2 deletions(-) create mode 100644 ansible/roles/grafana-dashboards/files/node-exporter-slurm.json diff --git a/ansible/roles/grafana-dashboards/files/node-exporter-slurm.json b/ansible/roles/grafana-dashboards/files/node-exporter-slurm.json new file mode 100644 index 000000000..3e4cc0f06 --- /dev/null +++ b/ansible/roles/grafana-dashboards/files/node-exporter-slurm.json @@ -0,0 +1,13926 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }, + { + "name": "DS_ELASTICSEARCH", + "label": "Elasticsearch", + "description": "", + "type": "datasource", + "pluginId": "elasticsearch", + "pluginName": "Elasticsearch" + } + ], + "__requires": [ + { + "type": "datasource", + "id": "elasticsearch", + "name": "Elasticsearch", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.3.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "datasource": "${DS_ELASTICSEARCH}", + "enable": false, + "hide": false, + "iconColor": "rgba(255, 96, 96, 1)", + "name": "All slurm jobs on node", + "query": "json.AllNodes:( \"$node\")", + "showIn": 0, + "tagsField": "json.AllNodes", + "textField": "json.JobID", + "timeEndField": "event.end", + "timeField": "event.start" + }, + { + "$$hashKey": "object:1058", + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": 
"Annotations & Alerts", + "showIn": 0, + "type": "dashboard" + }, + { + "datasource": "${DS_ELASTICSEARCH}", + "enable": true, + "hide": false, + "iconColor": "rgba(255, 96, 96, 1)", + "name": "Slurm job", + "query": "json.AllNodes:( \"$node\") AND json.JobID:$job_id", + "showIn": 0, + "tagsField": "json.AllNodes", + "textField": "json.JobID", + "timeEndField": "event.end", + "timeField": "event.start" + } + ] + }, + "editable": true, + "gnetId": 13427, + "graphTooltip": 0, + "id": null, + "iteration": 1605806643218, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total RAM", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 20, + "y": 0 + }, + "id": 75, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "node_memory_MemTotal_bytes{env=\"demo\", instance=\"wjs-ohpc-compute-0\", job=\"node\"}", + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "thresholds": "", + "title": "RAM Total", + "type": 
"singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total SWAP", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 22, + "y": 0 + }, + "id": 18, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "node_memory_SwapTotal_bytes{env=\"demo\", instance=\"wjs-ohpc-compute-0\", job=\"node\"}", + "targets": [ + { + "expr": "node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "thresholds": "", + "title": "SWAP Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapsed": false, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 2 + }, + "id": 261, + "panels": [], + "repeat": null, + "title": "Quick CPU / Mem / Disk", + "type": "row" + }, + { + "cacheTimeout": 
null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 0, + "y": 3 + }, + "id": 20, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.2", + "targets": [ + { + "expr": "(((count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))) - avg(sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])))) * 100) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "hide": false, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 900 + } + ], + "title": "CPU Busy", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together (5 min average)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 
54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 3, + "y": 3 + }, + "id": 155, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.2", + "targets": [ + { + "expr": "avg(node_load5{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "Sys Load (5m avg)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Busy state of all CPU cores together (15 min average)", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 85 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 95 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 6, + "y": 3 + }, + "id": 19, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.2", + "targets": [ + { + "expr": "avg(node_load15{instance=\"$node\",job=\"$job\"}) / count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu)) * 100", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + 
"title": "Sys Load (15m avg)", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Non available RAM memory", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "decimals": 0, + "mappings": [], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 9, + "y": 3 + }, + "hideTimeOverride": false, + "id": 16, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.2", + "targets": [ + { + "expr": "((node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "A", + "step": 900 + }, + { + "expr": "100 - ((node_memory_MemAvailable_bytes{instance=\"$node\",job=\"$job\"} * 100) / node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "B", + "step": 900 + } + ], + "title": "RAM Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Used Swap", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": 
"absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 25 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 12, + "y": 3 + }, + "id": 21, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.2", + "targets": [ + { + "expr": "((node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"}) / (node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} )) * 100", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "SWAP Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROMETHEUS}", + "description": "Used Root FS", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "mappings": [ + { + "id": 0, + "op": "=", + "text": "N/A", + "type": 1, + "value": "null" + } + ], + "max": 100, + "min": 0, + "nullValueMode": "null", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgba(50, 172, 45, 0.97)", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 80 + }, + { + "color": "rgba(245, 54, 54, 0.9)", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 3, + "x": 15, + "y": 3 + }, + "id": 154, + "links": [], + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.3.2", + "targets": [ + { + "expr": "100 - 
((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "title": "Root FS Used", + "type": "gauge" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "description": "Total number of CPU cores", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 3 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(count(node_cpu_seconds_total{instance=\"$node\",job=\"$job\"}) by (cpu))", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 900 + } + ], + "thresholds": "", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + 
"colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 1, + "description": "System uptime", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 4, + "x": 20, + "y": 3 + }, + "hideTimeOverride": true, + "id": 15, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "$$hashKey": "object:1094", + "name": "value to text", + "value": 1 + }, + { + "$$hashKey": "object:1095", + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "null", + "nullText": null, + "postfix": "s", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "{env=\"demo\", instance=\"wjs-ohpc-compute-0\", job=\"node\"}", + "targets": [ + { + "expr": "node_time_seconds{instance=\"$node\",job=\"$job\"} - node_boot_time_seconds{instance=\"$node\",job=\"$job\"}", + "intervalFactor": 2, + "refId": "A", + "step": 1800 + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "$$hashKey": "object:1097", + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 0, + "description": "Total RootFS", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": 
"bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 2, + "x": 18, + "y": 5 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "node_filesystem_size_bytes{device=\"/dev/vda1\", env=\"demo\", fstype=\"xfs\", instance=\"wjs-ohpc-compute-0\", job=\"node\", mountpoint=\"/\"}", + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",mountpoint=\"/\",fstype!=\"rootfs\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "A", + "step": 900 + } + ], + "thresholds": "70,90", + "title": "RootFS Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "collapsed": false, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 263, + "panels": [], + "repeat": null, + "title": "Basic CPU / Mem / Net / Disk", + "type": "row" + }, + { + "aliasColors": { + "Busy": "#EAB839", + "Busy Iowait": "#890F02", + "Busy other": "#1F78C1", + "Idle": "#052B51", + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "Basic CPU info", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 77, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 250, + "sort": null, + "sortDesc": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Busy Iowait", + "color": "#890F02" + }, + { + "alias": "Idle", + "color": "#7EB26D" + }, + { + "alias": "Busy System", + "color": "#EAB839" + }, + { + "alias": "Busy User", + "color": "#0A437C" + }, + { + "alias": "Busy Other", + "color": "#6D1F62" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Busy System", + "refId": "A", + "step": 240 + }, + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Busy User", + "refId": "B", + "step": 240 + }, + { + "expr": "sum by (instance)(irate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Busy Iowait", + "refId": "C", + "step": 240 + }, + { + 
"expr": "sum by (instance)(irate(node_cpu_seconds_total{mode=~\".*irq\",instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Busy IRQs", + "refId": "D", + "step": 240 + }, + { + "expr": "sum (irate(node_cpu_seconds_total{mode!='idle',mode!='user',mode!='system',mode!='iowait',mode!='irq',mode!='softirq',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Busy Other", + "refId": "E", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Idle", + "refId": "F", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "SWAP Used": "#BF1B00", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap Used": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "Basic memory usage", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 78, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "RAM Total", + "color": "#E0F9D7", + "fill": 0, + "stack": false + }, + { + "alias": "RAM Cache + Buffer", + "color": "#052B51" + }, + { + "alias": "RAM Free", + "color": "#7EB26D" + }, + { + "alias": "Avaliable", + "color": "#DEDAF7", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "RAM Total", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - (node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "RAM Used", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} + node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RAM Cache + Buffer", + 
"refId": "C", + "step": 240 + }, + { + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RAM Free", + "refId": "D", + "step": 240 + }, + { + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SWAP Used", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Recv_bytes_eth2": "#7EB26D", + "Recv_bytes_lo": "#0A50A1", + "Recv_drop_eth2": "#6ED0E0", + "Recv_drop_lo": "#E0F9D7", + "Recv_errs_eth2": "#BF1B00", + "Recv_errs_lo": "#CCA300", + "Trans_bytes_eth2": "#7EB26D", + "Trans_bytes_lo": "#0A50A1", + "Trans_drop_eth2": "#6ED0E0", + "Trans_drop_lo": "#E0F9D7", + "Trans_errs_eth2": "#BF1B00", + "Trans_errs_lo": "#CCA300", + "recv_bytes_lo": "#0A50A1", + "recv_drop_eth0": "#99440A", + "recv_drop_lo": "#967302", + "recv_errs_eth0": "#BF1B00", + "recv_errs_lo": "#890F02", + "trans_bytes_eth0": "#7EB26D", + "trans_bytes_lo": "#0A50A1", + "trans_drop_eth0": "#99440A", + "trans_drop_lo": "#967302", + "trans_errs_eth0": "#BF1B00", + "trans_errs_lo": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "Basic network info per interface", + "fieldConfig": { + 
"defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 74, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "recv {{device}}", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "trans {{device}} ", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + 
{ + "format": "pps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "Disk space used of all filesystems mounted", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 15 + }, + "height": "", + "hiddenSeries": false, + "id": 152, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} * 100) / node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used Basic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 265, + "panels": [ + { + "aliasColors": { + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 21 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 250, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": true, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode=\"system\",instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "System - Processes executing in kernel mode", + "refId": "A", + "step": 20 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='user',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "User - Normal processes executing in user mode", + "refId": "B", + "step": 240 + }, + { + "expr": "sum by 
(mode)(irate(node_cpu_seconds_total{mode='nice',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Nice - Niced processes executing in user mode", + "refId": "C", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='idle',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Idle - Waiting for something to happen", + "refId": "D", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='iowait',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Iowait - Waiting for I/O to complete", + "refId": "E", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='irq',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Irq - Servicing interrupts", + "refId": "F", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='softirq',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Softirq - Servicing softirqs", + "refId": "G", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='steal',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Steal - Time spent in other operating systems when running in a virtualized environment", + "refId": "H", + "step": 240 + }, + { + "expr": "sum by (mode)(irate(node_cpu_seconds_total{mode='guest',instance=\"$node\",job=\"$job\"}[5m])) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Guest - Time spent running a virtual CPU for a guest operating system", + "refId": "I", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU", + "tooltip": { 
+ "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "percentage", + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap - Swap memory usage": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839", + "Unused - Free memory unassigned": "#052B51" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 21 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Hardware Corrupted - *./", + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"node_memory_MemTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"} - node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"} - node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Apps - Memory used by user-space applications", + "refId": "A", + "step": 240 + }, + { + "expr": "node_memory_PageTables_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "PageTables - Memory used to map between virtual and physical memory addresses", + "refId": "B", + "step": 240 + }, + { + "expr": "node_memory_SwapCached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SwapCache - Memory that keeps track of pages that have been fetched from swap but not yet been modified", + "refId": "C", + "step": 240 + }, + { + "expr": "node_memory_Slab_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Slab - Memory used by the kernel to cache data structures for its own use (caches like inode, dentry, etc)", + "refId": "D", + "step": 240 + }, + { + "expr": "node_memory_Cached_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Cache - Parked file data (file content) cache", + "refId": "E", + "step": 240 + }, + { + "expr": "node_memory_Buffers_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Buffers - Block device (e.g. 
harddisk) cache", + "refId": "F", + "step": 240 + }, + { + "expr": "node_memory_MemFree_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Unused - Free memory unassigned", + "refId": "G", + "step": 240 + }, + { + "expr": "(node_memory_SwapTotal_bytes{instance=\"$node\",job=\"$job\"} - node_memory_SwapFree_bytes{instance=\"$node\",job=\"$job\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Swap - Swap space used", + "refId": "H", + "step": 240 + }, + { + "expr": "node_memory_HardwareCorrupted_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working", + "refId": "I", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Stack", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "receive_packets_eth0": "#7EB26D", + "receive_packets_lo": "#E24D42", + "transmit_packets_eth0": "#7EB26D", + "transmit_packets_lo": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 84, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": 
false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:5871", + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_bytes_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_network_transmit_bytes_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])*8", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:5884", + "format": "bps", + "label": "bits out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:5885", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + 
"fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 33 + }, + "height": "", + "hiddenSeries": false, + "id": 156, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'} - node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 45 + }, + "hiddenSeries": false, + "id": 229, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + 
"rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 480 + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "intervalFactor": 2, + 
"legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "io time": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 45 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*read*./", + "transform": "negative-Y" + }, + { + "alias": "/.*sda.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde.*/", + "color": "#E24D42" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Successfully read bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Successfully written bytes", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Usage Read / Write", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "io time": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 4, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 57 + }, + "hiddenSeries": false, + "id": 127, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\",device=~\"$diskdevices\"} [5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Time spent doing I/Os", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "I/O Usage Times", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": false, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "CPU / Memory / Net / Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 266, + "panels": [ + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 70 + }, + "hiddenSeries": false, + "id": 136, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, 
+ "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Inactive_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Inactive - Memory which has been less recently used. It is more eligible to be reclaimed for other purposes", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Active_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Active - Memory that has been used more recently and usually not reclaimed unless absolutely necessary", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Active / Inactive", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + 
"Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 70 + }, + "hiddenSeries": false, + "id": 135, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Committed_AS - *./" + }, + { + "alias": "/.*CommitLimit - *./", + "color": "#BF1B00", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Committed_AS_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Committed_AS - Amount of memory presently allocated on the system", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_CommitLimit_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "CommitLimit - Amount of memory currently available to be allocated on the system", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Commited", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 191, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Inactive_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Inactive_file - File-backed memory on inactive LRU list", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Inactive_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Inactive_anon - Anonymous and swap cache on inactive LRU list, including tmpfs (shmem)", + "refId": "B", + "step": 4 + }, + 
{ + "expr": "node_memory_Active_file_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Active_file - File-backed memory on active LRU list", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_Active_anon_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Active_anon - Anonymous and swap cache on active least-recently-used (LRU) list, including tmpfs", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Active / Inactive Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 130, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + 
"max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Writeback_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Writeback - Memory which is actively being written back to disk", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_WritebackTmp_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "WritebackTmp - Memory used by FUSE for temporary writeback buffers", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_Dirty_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Dirty - Memory which is waiting to get written back to the disk", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Writeback and Dirty", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - 
Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 90 + }, + "hiddenSeries": false, + "id": 138, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:4131", + "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "fill": 0 + }, + { + "$$hashKey": "object:4138", + "alias": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Mapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Mapped - Used memory in mapped pages files which have been mmaped, such as libraries", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Shmem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Shmem - Used shared memory (shared between several processes, thus including RAM disks)", + "refId": "B", + "step": 4 + }, + { + "expr": 
"node_memory_ShmemHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "ShmemHugePages - Memory used by shared memory (shmem) and tmpfs allocated with huge pages", + "refId": "C", + "step": 4 + }, + { + "expr": "node_memory_ShmemPmdMapped_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "ShmemPmdMapped - Ammount of shared (shmem/tmpfs) memory backed by huge pages", + "refId": "D", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Shared and Mapped", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4106", + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:4107", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 90 + }, + "hiddenSeries": false, + "id": 131, + 
"legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_SUnreclaim_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SUnreclaim - Part of Slab, that cannot be reclaimed on memory pressure", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_SReclaimable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "SReclaimable - Part of Slab, that might be reclaimed, such as caches", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Slab", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + 
"Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 100 + }, + "hiddenSeries": false, + "id": 70, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_VmallocChunk_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "VmallocChunk - Largest contigious block of vmalloc area which is free", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_VmallocTotal_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "VmallocTotal - Total size of vmalloc memory area", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_VmallocUsed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "VmallocUsed - Amount of vmalloc area which is used", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Vmalloc", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": 
"bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 100 + }, + "hiddenSeries": false, + "id": 159, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Bounce_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Bounce - Memory used for block device bounce buffers", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Bounce", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + 
"xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 110 + }, + "hiddenSeries": false, + "id": 129, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Inactive *./", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_AnonHugePages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "AnonHugePages - Memory in anonymous huge pages", + "refId": "A", + "step": 4 + }, + { + "expr": 
"node_memory_AnonPages_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "AnonPages - Memory in user pages not backed by files", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Anonymous", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 110 + }, + "hiddenSeries": false, + "id": 160, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + 
"renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_KernelStack_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "KernelStack - Kernel memory stack. This is not reclaimable", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Percpu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PerCPU - Per CPU memory allocated dynamically by loadable modules", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Kernel / CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#806EB7", + "Total RAM + Swap": "#806EB7", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 120 + }, + "hiddenSeries": false, + "id": 140, + "legend": { + "alignAsTable": 
true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_HugePages_Free{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages_Free - Huge pages in the pool that are not yet allocated", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_HugePages_Rsvd{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages_Rsvd - Huge pages for which a commitment to allocate from the pool has been made, but no allocation has yet been made", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_HugePages_Surp{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages_Surp - Huge pages in the pool above the value in /proc/sys/vm/nr_hugepages", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory HugePages Counter", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": 
"#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#806EB7", + "Total RAM + Swap": "#806EB7", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 120 + }, + "hiddenSeries": false, + "id": 71, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 2, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_HugePages_Total{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "HugePages - Total size of the pool of huge pages", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Hugepagesize_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Hugepagesize - Huge Page size", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory HugePages Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": 
"bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 130 + }, + "hiddenSeries": false, + "id": 128, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_DirectMap1G_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "DirectMap1G - Amount of pages mapped as this size", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_DirectMap2M_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "DirectMap2M - Amount of pages mapped as this size", + "refId": "B", 
+ "step": 4 + }, + { + "expr": "node_memory_DirectMap4k_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "DirectMap4K - Amount of pages mapped as this size", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory DirectMap", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 130 + }, + "hiddenSeries": false, + "id": 137, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_Unevictable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Unevictable - Amount of unevictable memory that can't be swapped out for a variety of reasons", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_Mlocked_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "MLocked - Size of pages locked to memory using the mlock() system call", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Unevictable and MLocked", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, 
+ "w": 12, + "x": 0, + "y": 140 + }, + "hiddenSeries": false, + "id": 132, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_NFS_Unstable_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "NFS Unstable - Memory in NFS pages sent to the server, but not yet commited to the storage", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory NFS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Memory Meminfo", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 24 + }, + "id": 267, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 176, + "legend": { + "alignAsTable": true, + "avg": 
true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*out/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_pgpgin{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pagesin - Page in operations", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pgpgout{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pagesout - Page out operations", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Pages In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + 
"values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*out/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_pswpin{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pswpin - Pages swapped in", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pswpout{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pswpout - Pages swapped out", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Pages Swap In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "pages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Apps": "#629E51", + "Buffers": "#614D93", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Free": "#0A437C", + "Hardware Corrupted - Amount of RAM that the kernel identified as corrupted / not working": "#CFFAFF", + "Inactive": "#584477", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "RAM_Free": "#E0F9D7", + "Slab": "#806EB7", + "Slab_Cache": "#E0752D", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Swap_Free": "#2F575E", + "Unused": "#EAB839" + }, + "bars": 
false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 175, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 350, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6118", + "alias": "Pgfault - Page major and minor fault operations", + "fill": 0, + "stack": false + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pgfault - Page major and minor fault operations", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pgmajfault - Major page fault operations", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_vmstat_pgfault{instance=\"$node\",job=\"$job\"}[5m]) - irate(node_vmstat_pgmajfault{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pgminfault - Minor page fault operations", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Page Faults", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": 
"object:6133", + "format": "short", + "label": "faults", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6134", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Active": "#99440A", + "Buffers": "#58140C", + "Cache": "#6D1F62", + "Cached": "#511749", + "Committed": "#508642", + "Dirty": "#6ED0E0", + "Free": "#B7DBAB", + "Inactive": "#EA6460", + "Mapped": "#052B51", + "PageTables": "#0A50A1", + "Page_Tables": "#0A50A1", + "Slab_Cache": "#EAB839", + "Swap": "#BF1B00", + "Swap_Cache": "#C15C17", + "Total": "#511749", + "Total RAM": "#052B51", + "Total RAM + Swap": "#052B51", + "Total Swap": "#614D93", + "VmallocUsed": "#EA6460" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 33 + }, + "hiddenSeries": false, + "id": 307, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_vmstat_oom_kill{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "oom killer invocations ", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "OOM Killer", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:5373", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:5374", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Memory Vmstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 25 + }, + "id": 293, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 260, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Variation*./", + "color": "#890F02" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_estimated_error_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Estimated error in seconds", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Time offset in between local system and reference 
clock", + "refId": "B", + "step": 240 + }, + { + "expr": "node_timex_maxerror_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum error in seconds", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Syncronized Drift", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 291, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_loop_time_constant{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Phase-locked loop time adjust", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time PLL Adjust", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 34 + }, + "hiddenSeries": false, + "id": 168, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Variation*./", + "color": "#890F02" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_sync_status{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Is clock synchronized to a reliable server (1 = yes, 0 = no)", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_frequency_adjustment_ratio{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Local clock frequency adjustment", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Syncronized Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, 
+ "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 34 + }, + "hiddenSeries": false, + "id": 294, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_timex_tick_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Seconds between clock ticks", + "refId": "A", + "step": 240 + }, + { + "expr": "node_timex_tai_offset_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "International Atomic Time (TAI) offset", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Misc", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + 
"logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "System Timesync", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 312, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_procs_blocked{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Processes blocked waiting for I/O to complete", + "refId": "A", + "step": 240 + }, + { + "expr": "node_procs_running{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Processes in runnable state", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": 
"counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 315, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ state }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes State", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 17 + }, + "hiddenSeries": false, + "id": 148, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_forks_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Processes forks second", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Forks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6640", + "format": "short", + "label": "forks / sec", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6641", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 17 + }, + "hiddenSeries": false, + "id": 149, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + 
"nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Max.*/", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "A", + "step": 240 + }, + { + "expr": "process_resident_memory_max_bytes{instance=\"$node\",job=\"$job\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "B", + "step": 240 + }, + { + "expr": "irate(process_virtual_memory_bytes{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Processes virtual memory size in bytes", + "refId": "C", + "step": 240 + }, + { + "expr": "irate(process_virtual_memory_max_bytes{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum amount of virtual memory available in bytes", + "refId": "D", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Processes Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + 
"datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 27 + }, + "hiddenSeries": false, + "id": 313, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:709", + "alias": "PIDs limit", + "color": "#F2495C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_pids{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Number of PIDs", + "refId": "A", + "step": 240 + }, + { + "expr": "node_processes_max_processes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PIDs limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PIDs Number and Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 
0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 27 + }, + "hiddenSeries": false, + "id": 305, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:4963", + "alias": "/.*waiting.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_schedstat_running_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{ cpu }} - seconds spent running a process", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_schedstat_waiting_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{ cpu }} - seconds spent by processing waiting for this CPU", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Process schdeule stats Running / Waiting", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + 
"dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 37 + }, + "hiddenSeries": false, + "id": 314, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:709", + "alias": "Threads limit", + "color": "#F2495C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_processes_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Allocated threads", + "refId": "A", + "step": 240 + }, + { + "expr": "node_processes_max_threads{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Threads limit", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Threads Number and Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6500", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6501", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "System Processes", + "type": "row" + }, + { + "collapsed": false, + "datasource": 
"${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 27 + }, + "id": 269, + "panels": [], + "repeat": null, + "title": "System Misc", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_context_switches_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Context switches", + "refId": "A", + "step": 240 + }, + { + "expr": "irate(node_intr_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Interrupts", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Context Switches / Interrupts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, 
+ "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 28 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 1m", + "refId": "A", + "step": 480 + }, + { + "expr": "node_load5{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 5m", + "refId": "B", + "step": 480 + }, + { + "expr": "node_load15{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Load 15m", + "refId": "C", + "step": 480 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "System Load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6261", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": 
"object:6262", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 38 + }, + "hiddenSeries": false, + "id": 259, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Critical*./", + "color": "#E24D42", + "fill": 0 + }, + { + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_interrupts_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ type }} - {{ info }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Interrupts Detail", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": 
null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 38 + }, + "hiddenSeries": false, + "id": 306, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_schedstat_timeslices_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{ cpu }}", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Schedule timeslices executed by each cpu", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + 
"overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 48 + }, + "hiddenSeries": false, + "id": 151, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_entropy_available_bits{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Entropy available to random number generators", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Entropy", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6568", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6569", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 48 + }, + "hiddenSeries": false, + "id": 308, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + 
"min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(process_cpu_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Time spent", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU time spent in user and system contexts", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4860", + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:4861", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 58 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + 
"pluginVersion": "7.3.2", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6323", + "alias": "/.*Max*./", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_max_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Maximum open file descriptors", + "refId": "A", + "step": 240 + }, + { + "expr": "process_open_fds{instance=\"$node\",job=\"$job\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Open file descriptors", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Descriptors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:6338", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6339", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 68 + }, + "id": 304, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 26 + }, + "hiddenSeries": false, + "id": 158, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + 
"dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:6726", + "alias": "/.*Critical*./", + "color": "#E24D42", + "fill": 0 + }, + { + "$$hashKey": "object:6727", + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_hwmon_temp_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} temp", + "refId": "A", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_alarm_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Critical Alarm", + "refId": "B", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Critical", + "refId": "C", + "step": 240 + }, + { + "expr": "node_hwmon_temp_crit_hyst_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Critical Historical", + "refId": "D", + "step": 240 + }, + { + "expr": "node_hwmon_temp_max_celsius{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ chip }} {{ sensor }} Max", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Hardware temperature monitor", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"$$hashKey": "object:6750", + "format": "celsius", + "label": "temperature", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:6751", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 26 + }, + "hiddenSeries": false, + "id": 300, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1655", + "alias": "/.*Max*./", + "color": "#EF843C", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_cooling_device_cur_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Current {{ name }} in {{ type }}", + "refId": "A", + "step": 240 + }, + { + "expr": "node_cooling_device_max_state{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Max {{ name }} in {{ type }}", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throttle cooling device", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"$$hashKey": "object:1678", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 36 + }, + "hiddenSeries": false, + "id": 302, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_power_supply_online{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ power_supply }} online", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Power supply", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1678", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Hardware Misc", 
+ "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 69 + }, + "id": 296, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 297, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_systemd_socket_accepted_connections_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ name }} Connections", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Systemd Sockets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 10 + }, + "hiddenSeries": false, + "id": 
298, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Failed", + "color": "#F2495C" + }, + { + "alias": "Inactive", + "color": "#FF9830" + }, + { + "alias": "Active", + "color": "#73BF69" + }, + { + "alias": "Deactivating", + "color": "#FFCB7D" + }, + { + "alias": "Activating", + "color": "#C8F2C2" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"activating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Activating", + "refId": "A", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"active\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Active", + "refId": "B", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"deactivating\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Deactivating", + "refId": "C", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"failed\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Failed", + "refId": "D", + "step": 240 + }, + { + "expr": "node_systemd_units{instance=\"$node\",job=\"$job\",state=\"inactive\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Inactive", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Systemd Units State", + "tooltip": { + "shared": true, + 
"sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "title": "Systemd", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 270, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "$$hashKey": "object:2033", + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "$$hashKey": "object:2034", + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "$$hashKey": "object:2035", + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "$$hashKey": "object:2036", + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "$$hashKey": "object:2037", + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "$$hashKey": "object:2038", + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "$$hashKey": "object:2039", + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + 
"$$hashKey": "object:2040", + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "$$hashKey": "object:2041", + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "$$hashKey": "object:2042", + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "$$hashKey": "object:2043", + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "$$hashKey": "object:2044", + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "$$hashKey": "object:2045", + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "$$hashKey": "object:2046", + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "$$hashKey": "object:2047", + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "$$hashKey": "object:2048", + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "$$hashKey": "object:2049", + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "$$hashKey": "object:2050", + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "$$hashKey": "object:2051", + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "$$hashKey": "object:2052", + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "$$hashKey": "object:2053", + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - Reads completed", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Writes completed", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Completed", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + 
"yaxes": [ + { + "$$hashKey": "object:2186", + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2187", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 29 + }, + "hiddenSeries": false, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": 
"/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_bytes_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "{{device}} - Read bytes", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_written_bytes_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Written bytes", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Data", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 39 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + 
"nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "intervalFactor": 4, + "legendFormat": "{{device}} - Read time", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_write_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Write time", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Time", + "tooltip": { + "shared": false, + "sort": 
0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time. read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 39 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + 
{ + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_weighted_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO time weighted", + "refId": "A", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOs Weighted", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 49 + }, + "hiddenSeries": false, + "id": 133, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": 
"/.*Read.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_merged_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Read merged", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_disk_writes_merged_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 2, + "legendFormat": "{{device}} - Write merged", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Merged", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "I/Os", + 
"logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 3, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 49 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": 
"/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO time", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_discard_time_seconds_total{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - discard time", + "refId": "B", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Spent Doing I/Os", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "time", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 59 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*sda_.*/", + "color": 
"#7EB26D" + }, + { + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "alias": "/.*sdd2.*/", + "color": "#F9934E" + }, + { + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_now{instance=\"$node\",job=\"$job\"}[5m])", + "intervalFactor": 4, + "legendFormat": "{{device}} - IO now", + "refId": "A", + "step": 8 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOs Current in Progress", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "iops", + "label": "I/Os", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 59 + }, + "hiddenSeries": false, + "id": 301, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2034", + "alias": "/.*sda_.*/", + "color": "#7EB26D" + }, + { + "$$hashKey": "object:2035", + "alias": "/.*sdb_.*/", + "color": "#EAB839" + }, + { + "$$hashKey": "object:2036", + "alias": "/.*sdc_.*/", + "color": "#6ED0E0" + }, + { + "$$hashKey": "object:2037", + "alias": "/.*sdd_.*/", + "color": "#EF843C" + }, + { + "$$hashKey": "object:2038", + "alias": "/.*sde_.*/", + "color": "#E24D42" + }, + { + "$$hashKey": "object:2039", + "alias": "/.*sda1.*/", + "color": "#584477" + }, + { + "$$hashKey": "object:2040", + "alias": "/.*sda2_.*/", + "color": "#BA43A9" + }, + { + "$$hashKey": "object:2041", + "alias": "/.*sda3_.*/", + "color": "#F4D598" + }, + { + "$$hashKey": "object:2042", + "alias": "/.*sdb1.*/", + "color": "#0A50A1" + }, + { + "$$hashKey": "object:2043", + "alias": "/.*sdb2.*/", + "color": "#BF1B00" + }, + { + "$$hashKey": "object:2044", + "alias": "/.*sdb3.*/", + "color": "#E0752D" + }, + { + "$$hashKey": "object:2045", + "alias": "/.*sdc1.*/", + "color": "#962D82" + }, + { + "$$hashKey": "object:2046", + "alias": "/.*sdc2.*/", + "color": "#614D93" + }, + { + "$$hashKey": "object:2047", + "alias": "/.*sdc3.*/", + "color": "#9AC48A" + }, + { + "$$hashKey": "object:2048", + "alias": "/.*sdd1.*/", + "color": "#65C5DB" + }, + { + "$$hashKey": "object:2049", + "alias": 
"/.*sdd2.*/", + "color": "#F9934E" + }, + { + "$$hashKey": "object:2050", + "alias": "/.*sdd3.*/", + "color": "#EA6460" + }, + { + "$$hashKey": "object:2051", + "alias": "/.*sde1.*/", + "color": "#E0F9D7" + }, + { + "$$hashKey": "object:2052", + "alias": "/.*sdd2.*/", + "color": "#FCEACA" + }, + { + "$$hashKey": "object:2053", + "alias": "/.*sde3.*/", + "color": "#F9E2D2" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_discards_completed_total{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "intervalFactor": 4, + "legendFormat": "{{device}} - Discards completed", + "refId": "A", + "step": 8 + }, + { + "expr": "irate(node_disk_discards_merged_total{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} - Discards merged", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Discards completed / merged", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:2186", + "format": "iops", + "label": "IOs", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:2187", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Storage Disk", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 71 + }, + "id": 271, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "description": "", + "fill": 2, + 
"fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 78 + }, + "hiddenSeries": false, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_avail_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Available", + "metric": "", + "refId": "A", + "step": 4 + }, + { + "expr": "node_filesystem_free_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Free", + "refId": "B", + "step": 2 + }, + { + "expr": "node_filesystem_size_bytes{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Size", + "refId": "C", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Filesystem space available", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3826", + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3827", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": 
false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 78 + }, + "hiddenSeries": false, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_files_free{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Free file nodes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Nodes Free", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3894", + "format": "short", + "label": "file nodes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3895", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 88 + }, + "hiddenSeries": false, + 
"id": 28, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filefd_maximum{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 4, + "legendFormat": "Max open files", + "refId": "A", + "step": 8 + }, + { + "expr": "node_filefd_allocated{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Open files", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Descriptor", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "files", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 88 + }, + "hiddenSeries": false, + "id": 219, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": 
[], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filesystem_files{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - File nodes total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "File Nodes Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "file Nodes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "/ ReadOnly": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 98 + }, + "hiddenSeries": false, + "id": 44, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": 
"node_filesystem_readonly{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - ReadOnly", + "refId": "A", + "step": 4 + }, + { + "expr": "node_filesystem_device_error{instance=\"$node\",job=\"$job\",device!~'rootfs'}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{mountpoint}} - Device error", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Filesystem in ReadOnly / Error", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:3670", + "format": "short", + "label": "counter", + "logBase": 1, + "max": "1", + "min": "0", + "show": true + }, + { + "$$hashKey": "object:3671", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Storage Filesystem", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 72 + }, + "id": 272, + "panels": [ + { + "aliasColors": { + "receive_packets_eth0": "#7EB26D", + "receive_packets_lo": "#E24D42", + "transmit_packets_eth0": "#7EB26D", + "transmit_packets_lo": "#E24D42" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 60, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + 
"linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_packets_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_packets_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic by Packets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 30 + }, + "hiddenSeries": false, + "id": 142, + "legend": { + "alignAsTable": 
true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_errs_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive errors", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_errs_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit errors", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 143, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_drop_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive drop", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_drop_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit drop", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Drop", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": 
true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 141, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_compressed_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive compressed", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_compressed_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit compressed", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Compressed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 50 + }, + "hiddenSeries": false, + "id": 146, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_multicast_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive multicast", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Multicast", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, 
+ "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 50 + }, + "hiddenSeries": false, + "id": 144, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_fifo_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive fifo", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_fifo_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit fifo", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeRegions": [], + "timeShift": null, + "title": "Network Traffic Fifo", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 60 + }, + "hiddenSeries": false, + "id": 145, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:576", + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_frame_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}} - Receive frame", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic 
Frame", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:589", + "format": "pps", + "label": "packets out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:590", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 60 + }, + "hiddenSeries": false, + "id": 231, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_transmit_carrier_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Statistic transmit_carrier", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Carrier", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": 
"graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 70 + }, + "hiddenSeries": false, + "id": 232, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Trans.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_transmit_colls_total{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}} - Transmit colls", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Colls", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { 
+ "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 70 + }, + "hiddenSeries": false, + "id": 61, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:663", + "alias": "NF conntrack limit", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_nf_conntrack_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "NF conntrack entries", + "refId": "A", + "step": 4 + }, + { + "expr": "node_nf_conntrack_entries_limit{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "NF conntrack limit", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "NF Conntrack", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:678", + "format": "short", + "label": "entries", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { 
+ "$$hashKey": "object:679", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 230, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_arp_entries{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - ARP entries", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ARP Entries", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Entries", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 288, + 
"legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_mtu_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - Bytes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "MTU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 90 + }, + "hiddenSeries": false, + "id": 280, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + 
"steppedLine": false, + "targets": [ + { + "expr": "node_network_speed_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - Speed", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Speed", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 90 + }, + "hiddenSeries": false, + "id": 289, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_transmit_queue_length{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ device }} - Interface transmit queue length", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Queue Length", + "tooltip": { + "shared": true, + "sort": 0, + 
"value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "none", + "label": "packets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 100 + }, + "hiddenSeries": false, + "id": 290, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:232", + "alias": "/.*Dropped.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_softnet_processed_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}} - Processed", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_softnet_dropped_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}} - Dropped", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Softnet Packets", + "tooltip": 
{ + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:207", + "format": "short", + "label": "packetes drop (-) / process (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:208", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 100 + }, + "hiddenSeries": false, + "id": 310, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_softnet_times_squeezed_total{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU {{cpu}} - Squeezed", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Softnet Out of Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"$$hashKey": "object:207", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:208", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 110 + }, + "hiddenSeries": false, + "id": 309, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_network_up{operstate=\"up\",instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{interface}} - Operational state UP", + "refId": "A", + "step": 4 + }, + { + "expr": "node_network_carrier{instance=\"$node\",job=\"$job\",device=~\"^(([^v].{3}|.[^e].{2}|.{2}[^t].|.{3}[^h]).*|.{0,3})$\",device=~\"^(([^c].{2}|.[^n].{1}|.{2}[^i]).*|.{0,2})$\"}", + "format": "time_series", + "instant": false, + "legendFormat": "{{device}} - Physical link state", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Operational Status", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": 
null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Traffic", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 73 + }, + "id": 273, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 13 + }, + "hiddenSeries": false, + "id": 63, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_TCP_alloc{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_alloc - Allocated sockets", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_inuse - Tcp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": 
"TCP_mem - Used memory for tcp", + "refId": "C", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_orphan{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_orphan - Orphan sockets", + "refId": "D", + "step": 240 + }, + { + "expr": "node_sockstat_TCP_tw{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCP_tw - Sockets wating close", + "refId": "E", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat TCP", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 13 + }, + "hiddenSeries": false, + "id": 124, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"node_sockstat_UDPLITE_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "UDPLITE_inuse - Udplite sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "UDP_inuse - Udp sockets currently in use", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_UDP_mem{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "UDP_mem - Used memory for udp", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat UDP", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 126, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": 
"flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_sockets_used{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Sockets_used - Sockets currently in use", + "refId": "A", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat Used", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "sockets", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 23 + }, + "hiddenSeries": false, + "id": 220, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_TCP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "mem_bytes - TCP sockets in that state", + "refId": "A", + "step": 240 + 
}, + { + "expr": "node_sockstat_UDP_mem_bytes{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "mem_bytes - UDP sockets in that state", + "refId": "B", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat Memory Size", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "bytes", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 33 + }, + "hiddenSeries": false, + "id": 125, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_sockstat_FRAG_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "FRAG_inuse - Frag sockets currently in use", + "refId": "A", + "step": 240 + }, + { + "expr": "node_sockstat_FRAG_memory{instance=\"$node\",job=\"$job\"}", + "format": 
"time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "FRAG_memory - Used memory for frag", + "refId": "B", + "step": 240 + }, + { + "expr": "node_sockstat_RAW_inuse{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "RAW_inuse - Raw sockets currently in use", + "refId": "C", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sockstat FRAG / RAW", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1572", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1573", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Sockstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 74 + }, + "id": 274, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 32 + }, + "height": "", + "hiddenSeries": false, + "id": 221, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1876", + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_IpExt_InOctets{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InOctets - Received octets", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_IpExt_OutOctets{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "OutOctets - Sent octets", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Netstat IP In / Out Octets", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1889", + "format": "short", + "label": "octects out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1890", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 32 + }, + "height": "", + "hiddenSeries": false, + "id": 81, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sideWidth": 300, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 
1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Ip_Forwarding{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Forwarding - IP forwarding", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Netstat IP Forwarding", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1957", + "format": "short", + "label": "datagrams", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1958", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 42 + }, + "height": "", + "hiddenSeries": false, + "id": 115, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + 
"transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Icmp_InMsgs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InMsgs - Messages which the entity received. Note that this counter includes all those counted by icmpInErrors", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Icmp_OutMsgs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "OutMsgs - Messages which this entity attempted to send. Note that this counter includes all those counted by icmpOutErrors", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "messages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 42 + }, + "height": "", + "hiddenSeries": false, + "id": 50, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + 
"nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Icmp_InErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InErrors - Messages which the entity received but determined as having ICMP-specific errors (bad ICMP checksums, bad length, etc.)", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "ICMP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "messages out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 52 + }, + "height": "", + "hiddenSeries": false, + "id": 55, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + 
"points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Snd.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Udp_InDatagrams{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InDatagrams - Datagrams received", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Udp_OutDatagrams{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "OutDatagrams - Datagrams sent", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "UDP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "datagrams out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 52 + }, + "height": "", + "hiddenSeries": false, + "id": 109, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + 
"options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Udp_InErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "InErrors - UDP Datagrams that could not be delivered to an application", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Udp_NoPorts{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "NoPorts - UDP Datagrams received on a port with no listener", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_netstat_UdpLite_InErrors{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "legendFormat": "InErrors Lite - UDPLite Datagrams that could not be delivered to an application", + "refId": "C" + }, + { + "expr": "irate(node_netstat_Udp_RcvbufErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "RcvbufErrors - UDP buffer errors received", + "refId": "D", + "step": 4 + }, + { + "expr": "irate(node_netstat_Udp_SndbufErrors{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "SndbufErrors - UDP buffer errors send", + "refId": "E", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "UDP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:4232", + "format": "short", + "label": "datagrams", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": 
"object:4233", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 62 + }, + "height": "", + "hiddenSeries": false, + "id": 299, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Out.*/", + "transform": "negative-Y" + }, + { + "alias": "/.*Snd.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Tcp_InSegs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "InSegs - Segments received, including those received in error. 
This count includes segments received on currently established connections", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Tcp_OutSegs{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "OutSegs - Segments sent, including those on current connections but excluding those containing only retransmitted octets", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP In / Out", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "datagrams out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 62 + }, + "height": "", + "hiddenSeries": false, + "id": 104, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"irate(node_netstat_TcpExt_ListenOverflows{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ListenOverflows - Times the listen queue of a socket overflowed", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_ListenDrops{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "ListenDrops - SYNs to LISTEN sockets ignored", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_TCPSynRetrans{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "TCPSynRetrans - SYN-SYN/ACK retransmits to break down retransmissions in SYN, fast/timeout retransmits", + "refId": "C", + "step": 4 + }, + { + "expr": "irate(node_netstat_Tcp_RetransSegs{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "legendFormat": "RetransSegs - Segments retransmitted - that is, the number of TCP segments transmitted containing one or more previously transmitted octets", + "refId": "D" + }, + { + "expr": "irate(node_netstat_Tcp_InErrs{instance=\"$node\",job=\"$job\"}[5m])", + "interval": "", + "legendFormat": "InErrs - Segments received in error (e.g., bad TCP checksums)", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Errors", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 72 + }, + "height": "", + "hiddenSeries": false, + "id": 85, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:454", + "alias": "/.*MaxConn *./", + "color": "#890F02", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_netstat_Tcp_CurrEstab{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT", + "refId": "A", + "step": 4 + }, + { + "expr": "node_netstat_Tcp_MaxConn{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "MaxConn - Limit on the total number of TCP connections the entity can support (Dinamic is \"-1\")", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:469", + "format": "short", + "label": "connections", + "logBase": 1, + "max": null, + "min": "0", + "show": true + 
}, + { + "$$hashKey": "object:470", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 72 + }, + "height": "", + "hiddenSeries": false, + "id": 91, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*Sent.*/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_TcpExt_SyncookiesFailed{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "SyncookiesFailed - Invalid SYN cookies received", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_SyncookiesRecv{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "SyncookiesRecv - SYN cookies received", + "refId": "B", + "step": 4 + }, + { + "expr": "irate(node_netstat_TcpExt_SyncookiesSent{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "SyncookiesSent - SYN cookies sent", + "refId": "C", + "step": 4 + } + ], + "thresholds": [], + 
"timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP SynCookie", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "counter out (-) / in (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 82 + }, + "height": "", + "hiddenSeries": false, + "id": 82, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "maxPerRow": 12, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_netstat_Tcp_ActiveOpens{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "ActiveOpens - TCP connections that have made a direct transition to the SYN-SENT state from the CLOSED state", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_netstat_Tcp_PassiveOpens{instance=\"$node\",job=\"$job\"}[5m])", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "PassiveOpens - TCP connections that have made a direct transition to the SYN-RCVD 
state from the LISTEN state", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP Direct Transition", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "connections", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Network Netstat", + "type": "row" + }, + { + "collapsed": true, + "datasource": "${DS_PROMETHEUS}", + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 75 + }, + "id": 279, + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 54 + }, + "hiddenSeries": false, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_scrape_collector_duration_seconds{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}} - Scrape duration", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeRegions": [], + "timeShift": null, + "title": "Node Exporter Scrape Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "seconds", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_PROMETHEUS}", + "description": "", + "fill": 2, + "fillGradient": 0, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 54 + }, + "hiddenSeries": false, + "id": 157, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1969", + "alias": "/.*error.*/", + "color": "#F2495C", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_scrape_collector_success{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}} - Scrape success", + "refId": "A", + "step": 4 + }, + { + "expr": "node_textfile_scrape_error{instance=\"$node\",job=\"$job\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{collector}} - Scrape textfile error (1 = true)", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeRegions": [], + "timeShift": null, + "title": "Node Exporter Scrape", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1484", + "format": "short", + "label": "counter", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:1485", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "repeat": null, + "title": "Node Exporter", + "type": "row" + } + ], + "refresh": false, + "schemaVersion": 26, + "style": "dark", + "tags": [ + "linux" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "prometheus", + "value": "prometheus" + }, + "error": null, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "prometheus", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(node_cpu_seconds_total, job)", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Job", + "multi": false, + "name": "job", + "options": [], + "query": "label_values(node_cpu_seconds_total, job)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": ".+", + "value": ".+" + }, + "error": null, + "hide": 0, + "label": "Host Filter Regex", + "name": "host_filter", + "options": [ + { + "selected": true, + "text": ".+", + "value": ".+" + } + ], + "query": ".+", + "skipUrlSync": false, + "type": "textbox" + }, + { + 
"allValue": null, + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "definition": "label_values(node_cpu_seconds_total{job=\"$job\", instance=~\"$host_filter\"}, instance)", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Host:", + "multi": false, + "name": "node", + "options": [], + "query": "label_values(node_cpu_seconds_total{job=\"$job\", instance=~\"$host_filter\"}, instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "", + "value": "" + }, + "error": null, + "hide": 2, + "includeAll": false, + "label": null, + "multi": false, + "name": "diskdevices", + "options": [ + { + "selected": false, + "text": "[a-z]+|nvme[0-9]+n[0-9]+", + "value": "[a-z]+|nvme[0-9]+n[0-9]+" + } + ], + "query": "[a-z]+|nvme[0-9]+n[0-9]+", + "skipUrlSync": false, + "type": "custom" + }, + { + "current": { + "selected": true, + "text": "", + "value": "" + }, + "error": null, + "hide": 0, + "label": "Slurm Job ID", + "name": "job_id", + "options": [ + { + "selected": true, + "text": "", + "value": "" + } + ], + "query": "", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Node Exporter Slurm", + "uid": "node-exporter-slurm", + "version": 16, + "description": "Mash up of slurm-stats and node-exporter" +} \ No newline at end of file diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index b272c36e1..5664a6455 100644 --- 
a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -89,7 +89,7 @@ grafana_plugins: [] # Additional datasources to configure alongside kube-prometheus-stack defaults grafana_datasources: [] -grafana_home_dashboard: 13427.json # node exporter slurm +grafana_home_dashboard: node-exporter-slurm.json alertmanager_image_tag: v0.27.0 grafana_sidecar_image_tag: 1.26.1 diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 930bb5988..ee38738ed 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -11,7 +11,7 @@ grafana_domain: "{{ ( openondemand_servername | default('') ) if groups['openond grafana_dashboards_default: # node exporter slurm: - - dashboard_id: 13427 + - dashboard_file: node-exporter-slurm.json replacements: - placeholder: DS_PROMETHEUS replacement: prometheus From 46d95ef8ceab04430281c9aa9bf768b88808937f Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 25 Oct 2024 14:16:58 +0100 Subject: [PATCH 65/90] set default dashboard to slurm jobs --- ansible/roles/kube_prometheus_stack/defaults/main/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index 5664a6455..63fdc30ef 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -89,7 +89,7 @@ grafana_plugins: [] # Additional datasources to configure alongside kube-prometheus-stack defaults grafana_datasources: [] -grafana_home_dashboard: node-exporter-slurm.json +grafana_home_dashboard: slurm-jobs.json alertmanager_image_tag: v0.27.0 grafana_sidecar_image_tag: 1.26.1 From b2b673f7d22f27e0854cba720da921b2a782baa3 Mon Sep 17 00:00:00 2001 From: 
wtripp180901 Date: Fri, 25 Oct 2024 14:18:52 +0100 Subject: [PATCH 66/90] added ansible to migrate cloudalchemy data to KPS --- .../kube_prometheus_stack/tasks/main.yml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 5ea7c9453..8c16d801b 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -1,5 +1,35 @@ --- +- name: Checking for existing Prometheus data directory + ansible.builtin.stat: + path: "{{ prometheus_db_dir }}" + register: prom_exists_result + +- name: Check if data is in kube-prometheus-stack + ansible.builtin.stat: + path: "{{ prometheus_db_dir }}/prometheus-db" + register: prom_already_migrated + when: prom_exists_result.stat.exists + +- name: Migrate existing Prometheus data from cloudalchemy roles to kube-prometheus-stack + when: prom_exists_result.stat.exists and not prom_already_migrated.stat.exists + block: + - name: Get existing files to copy + ansible.builtin.find: + paths: "{{ prometheus_db_dir }}" + file_type: any + register: prometheus_files + + - name: Create KPS subdirectory + ansible.builtin.file: + path: "{{ prometheus_db_dir }}/prometheus-db" + state: directory + + - name: Move data to KPS subdirectory + ansible.builtin.shell: + cmd: "mv {{ item.path }} {{ prometheus_db_dir }}/prometheus-db/{{ item.path | regex_search('([^/]+$)') }}" + loop: "{{ prometheus_files.files }}" + - name: Creating namespace kubernetes.core.k8s: name: "{{ kube_prometheus_stack_release_namespace }}" From d712f69638c3faec7a24613f0bf3c643fedeac56 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Fri, 25 Oct 2024 15:37:13 +0100 Subject: [PATCH 67/90] updated docs --- docs/monitoring-and-logging.README.md | 4 ++++ docs/persistent-state.md | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/monitoring-and-logging.README.md 
b/docs/monitoring-and-logging.README.md index 961634419..b8d3a86bd 100644 --- a/docs/monitoring-and-logging.README.md +++ b/docs/monitoring-and-logging.README.md @@ -228,6 +228,10 @@ The port can customised by overriding the `prometheus_port` variable. Note that this service is not password protected, allowing anyone with access to the URL to make queries. +### Upgrades + +The appliance previously used the [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role to configure Prometheus, but our monitoring stack has since been moved into the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) Helm chart running on a k3s cluster. Some of the default Grafana dashboards deployed by kube-prometheus-stack are hardcoded to expect the `job` label of metrics scraped from Node Exporter to have the value `node-exporter`. By default, the cloudalchemy role scraped these metrics with the `job` label set to `node`. Therefore, if upgrading from previous versions of the appliance which used the cloudalchemy role, pre-upgrade data will not show up by default in Grafana dashboards. The old data can still be viewed in the OpenHPC and Node Exporter Slurm dashboards by selecting the previous `job` value from the Job dropdown. + ### Alerting and recording rules See the upstream documentation for [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules. diff --git a/docs/persistent-state.md b/docs/persistent-state.md index 7a15d4c37..cccb4ac6d 100644 --- a/docs/persistent-state.md +++ b/docs/persistent-state.md @@ -6,8 +6,7 @@ At present this will affect the following: - `slurmctld` state, i.e. the Slurm queue. - The MySQL database for `slurmdbd`, i.e. Slurm accounting information as shown by the `sacct` command. 
- Prometheus database -- Grafana data -- OpenDistro/elasticsearch data +- OpenSearch data If using the `environments/common/layout/everything` Ansible groups template (which is the default for a new cookiecutter-produced environment) then these services will all be on the `control` node and hence only this node requires persistent storage. From 2d163563f45891e03b811a46c33305a1691db91c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 10:10:43 +0000 Subject: [PATCH 68/90] cleaned up dashboard role --- .../roles/grafana-dashboards/tasks/main.yml | 32 +++---------------- .../templates/configmap-template.yml.j2 | 3 +- 2 files changed, 6 insertions(+), 29 deletions(-) diff --git a/ansible/roles/grafana-dashboards/tasks/main.yml b/ansible/roles/grafana-dashboards/tasks/main.yml index d8042d0bb..36f7af0d4 100644 --- a/ansible/roles/grafana-dashboards/tasks/main.yml +++ b/ansible/roles/grafana-dashboards/tasks/main.yml @@ -23,6 +23,7 @@ # SOFTWARE. - become: false + delegate_to: localhost block: - name: Create local grafana dashboard directory file: @@ -108,35 +109,10 @@ when: - grafana_dashboards | length > 0 -# Templating partial manifests and then adding the dashboard's data server-side because the k8s module doesn't like Jinja2 -- name: Create partial configmaps for server-side templating - ansible.builtin.template: - src: configmap-template.yml.j2 - dest: "{{ _tmp_dashboards.path }}/{{ item_filename }}.yml" - loop: "{{ grafana_dashboards }}" - vars: - item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" - -- name: Setting data keys - ansible.builtin.replace: - path: "{{ _tmp_dashboards.path }}/{{ item_filename }}.yml" - regexp: 'PLACEHOLDER' - replace: "{{ item_filename }}" - loop: "{{ grafana_dashboards }}" - vars: - item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" - -- name: Appending json data to configmaps - 
ansible.builtin.shell: - cmd: "sed 's/^/ /' {{ item_path }} >> {{ item_path }}.yml" - loop: "{{ grafana_dashboards }}" - vars: - item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" - item_path: "{{ _tmp_dashboards.path }}/{{ item_filename }}" - -- name: Applying dashboard configmaps +- name: Template configmaps from dashboards ansible.builtin.k8s: - src: "{{ item_path }}.yml" + definition: + "{{ lookup('ansible.builtin.template', 'configmap-template.yml.j2') | from_yaml }}" loop: "{{ grafana_dashboards }}" vars: item_filename: "{{ ((item.dashboard_id | string) + '.json') if 'dashboard_id' in item else item.dashboard_file }}" diff --git a/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 b/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 index d6e2473a0..59a17e565 100644 --- a/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 +++ b/ansible/roles/grafana-dashboards/templates/configmap-template.yml.j2 @@ -6,4 +6,5 @@ metadata: labels: grafana_dashboard: "1" data: - PLACEHOLDER: | + {{ item_filename }}: | +{{ lookup('ansible.builtin.file', _tmp_dashboards.path+'/'+item_filename) | indent(4, first=True) }} From c6b221e57eac5625622f8048440d75553f6b07c3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 10:39:35 +0000 Subject: [PATCH 69/90] moved image pre-pull list to rolevar --- .../kube_prometheus_stack/defaults/main/install.yml | 11 +++++++++++ ansible/roles/kube_prometheus_stack/tasks/install.yml | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) create mode 100644 ansible/roles/kube_prometheus_stack/defaults/main/install.yml diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml new file mode 100644 index 000000000..15ef8a297 --- /dev/null +++ b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml @@ -0,0 +1,11 @@ +# Images 
to pre-pull during build, see ./main.yml for tag definitions +image_list: +- { name: "docker.io/grafana/grafana", tag: "{{ grafana_image_tag }}" } +- { name: "quay.io/prometheus/prometheus", tag: "{{ prometheus_image_tag }}" } +- { name: "quay.io/prometheus/alertmanager", tag: "{{ alertmanager_image_tag }}" } +- { name: "quay.io/prometheus/node-exporter", tag: "{{ node_exporter_image_tag }}" } +- { name: "quay.io/prometheus-operator/prometheus-config-reloader", tag: "{{ kube_prometheus_stack_app_version }}" } +- { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } +- { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } +- { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" } +- { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } \ No newline at end of file diff --git a/ansible/roles/kube_prometheus_stack/tasks/install.yml b/ansible/roles/kube_prometheus_stack/tasks/install.yml index d3ff3582a..e0ec9b821 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/install.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/install.yml @@ -1,17 +1,6 @@ --- - name: Pre-pull kube-prometheus-stack images and import to k3s - vars: - image_list: - - { name: "docker.io/grafana/grafana", tag: "{{ grafana_image_tag }}" } - - { name: "quay.io/prometheus/prometheus", tag: "{{ prometheus_image_tag }}" } - - { name: "quay.io/prometheus/alertmanager", tag: "{{ alertmanager_image_tag }}" } - - { name: "quay.io/prometheus/node-exporter", tag: "{{ node_exporter_image_tag }}" } - - { name: "quay.io/prometheus-operator/prometheus-config-reloader", tag: "{{ kube_prometheus_stack_app_version }}" } - - { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } - - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag 
}}" } - - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" } - - { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } block: - name: Pull with images with podman containers.podman.podman_image: From d1c915ef9ae637d1e9a51541e8dee88a013f223c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 11:19:16 +0000 Subject: [PATCH 70/90] doc changes + opensearch datasource now based on opensearch group --- ansible/roles/kube_prometheus_stack/tasks/main.yml | 2 +- docs/monitoring-and-logging.md | 2 +- environments/common/inventory/group_vars/all/defaults.yml | 1 + environments/common/inventory/group_vars/all/grafana.yml | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 8c16d801b..bece81154 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -166,7 +166,7 @@ name: opensearch subsets: - addresses: - - ip: "{{ control_ip }}" + - ip: "{{ opensearch_ip }}" ports: - port: 9200 name: opensearch diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index b8d3a86bd..b2eb62001 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -3,7 +3,7 @@ ## Components overview ### [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) -An umbrella Helm chart which the appliance uses to deploy and manages containerised versions of Grafana and Prometheus. +An umbrella Helm chart which the appliance uses to deploy and manages containerised versions of Prometheus, Grafana, Alertmanager and Node Exporter. 
### [filebeat](https://www.elastic.co/beats/filebeat) diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index ceded234b..03df64dd2 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -22,6 +22,7 @@ grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}" # Service IP addresses openondemand_ip: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" +opensearch_ip: "{{ hostvars[groups['opensearch'].0].ansible_host if groups['opensearch'] else '' }}" ############################# bootstrap: local user configuration ######################### diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index ee38738ed..2e209dece 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -49,7 +49,7 @@ grafana_datasources: - name: slurmstats # see https://github.com/grafana/opensearch-datasource#configure-the-data-source-with-provisioning type: grafana-opensearch-datasource - url: "https://{{ control_ip }}:9200" + url: "https://{{ opensearch_ip }}:9200" basicAuth: true basicAuthUser: admin secureJsonData: From b6be009e06f20c26400b14082df48011fce6a0df Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 11:35:02 +0000 Subject: [PATCH 71/90] made kps default dashboards more configurable --- .../roles/kube_prometheus_stack/defaults/main/main.yml | 4 +++- ansible/roles/kube_prometheus_stack/tasks/main.yml | 8 +------- docs/monitoring-and-logging.md | 2 +- environments/common/inventory/group_vars/all/grafana.yml | 9 +++++++++ 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml 
index 63fdc30ef..bf6097089 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -73,6 +73,9 @@ prometheus_rules: # ------------------------------------------------------------------------------------------ grafana_image_tag: 11.2.2 +grafana_sidecar_image_tag: 1.26.1 + +grafana_exclude_default_dashboards: [] grafana_security: admin_user: grafana @@ -92,7 +95,6 @@ grafana_datasources: [] grafana_home_dashboard: slurm-jobs.json alertmanager_image_tag: v0.27.0 -grafana_sidecar_image_tag: 1.26.1 node_exporter_image_tag: v1.8.2 node_exporter_args: diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index bece81154..13488de58 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -197,10 +197,4 @@ definition: metadata: name: "{{ item }}" - loop: - - kube-prometheus-stack-nodes-darwin - - kube-prometheus-stack-grafana-overview - - kube-prometheus-stack-proxy - - kube-prometheus-stack-etcd - - kube-prometheus-stack-alertmanager-overview - - kube-prometheus-stack-scheduler + loop: "{{ grafana_exclude_default_dashboards }}" diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index b2eb62001..c1e00f5a1 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -109,7 +109,7 @@ Note that if Open OnDemand is enabled, Grafana is only accessible through OOD's ### grafana dashboards -In addition to the default set of dashboards that are deployed by kube-prometheus-stack, the appliance ships with a default set of dashboards (listed below). The set of appliance-specific dashboards can be configured via the `grafana_dashboards` variable. The dashboards are either internal to the [grafana-dashboards role](../ansible/roles/grafana-dashboards/files/) or downloaded from grafana.com. 
+In addition to the default set of dashboards that are deployed by kube-prometheus-stack, the appliance ships with a default set of dashboards (listed below). The set of appliance-specific dashboards can be configured via the `grafana_dashboards` variable. The dashboards are either internal to the [grafana-dashboards role](../ansible/roles/grafana-dashboards/files/) or downloaded from grafana.com. If you wish to selectively remove the default dashboards deployed by kube-prometheus-stack, this can be done by overriding the `grafana_exclude_default_dashboards` variable in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). #### node exporter slurm diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index 2e209dece..ee874d2ed 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -40,6 +40,15 @@ grafana_dashboards_default: revision_id: 3 grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}" +# Configmap names of kube prometheus stack's default dashboards to exclude +grafana_exclude_default_dashboards: +- kube-prometheus-stack-nodes-darwin +- kube-prometheus-stack-grafana-overview +- kube-prometheus-stack-proxy +- kube-prometheus-stack-etcd +- kube-prometheus-stack-alertmanager-overview +- kube-prometheus-stack-scheduler + grafana_security: admin_user: grafana admin_password: "{{ vault_grafana_admin_password }}" From 206134d2f7b750dcb5b409081e1edde1899174f2 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 11:36:38 +0000 Subject: [PATCH 72/90] bump image up to date with main --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index 45ef20938..6b440865e 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,7 +1,7 @@ { "cluster_image": { - "RL8": "openhpc-RL8-241024-0744-d0024540", - "RL9": "openhpc-RL9-241024-0744-d0024540", - "RL9-cuda": "openhpc-cuda-RL9-241024-0744-d0024540" + "RL8": "openhpc-RL8-241029-0905-f23c2fca", + "RL9": "openhpc-RL9-241029-0949-f23c2fca", + "RL9-cuda": "openhpc-cuda-RL9-241029-0905-f23c2fca" } } From 8364eb8ee9dbc2d94784f2c70dba2a6183c73441 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 11:39:42 +0000 Subject: [PATCH 73/90] newline --- ansible/roles/kube_prometheus_stack/defaults/main/install.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml index 15ef8a297..b1fcdc7c1 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/install.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/install.yml @@ -8,4 +8,4 @@ image_list: - { name: "quay.io/prometheus-operator/prometheus-operator", tag: "{{ kube_prometheus_stack_app_version }}" } - { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" } - { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" } -- { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } \ No newline at end of file +- { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" } From 603e818d6c89dd5268756bded1a0a6eba3fa7c98 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 29 Oct 2024 12:06:13 +0000 Subject: [PATCH 74/90] bumped caas minimum 
control node ram --- environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml | 2 +- environments/.caas/ui-meta/slurm-infra-manila-home.yml | 2 +- environments/.caas/ui-meta/slurm-infra.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml index ab10eff20..3f0c6c025 100644 --- a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml +++ b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -29,7 +29,7 @@ parameters: kind: cloud.size immutable: true options: - min_ram: 4096 + min_ram: 6144 min_disk: 20 - name: compute_count diff --git a/environments/.caas/ui-meta/slurm-infra-manila-home.yml b/environments/.caas/ui-meta/slurm-infra-manila-home.yml index 4a01bb6fa..00b86fa54 100644 --- a/environments/.caas/ui-meta/slurm-infra-manila-home.yml +++ b/environments/.caas/ui-meta/slurm-infra-manila-home.yml @@ -32,7 +32,7 @@ parameters: kind: cloud.size immutable: true options: - min_ram: 4096 + min_ram: 6144 min_disk: 20 - name: compute_count diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml index 36b89281d..472b32c24 100644 --- a/environments/.caas/ui-meta/slurm-infra.yml +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -29,7 +29,7 @@ parameters: kind: cloud.size immutable: true options: - min_ram: 4096 + min_ram: 6144 min_disk: 20 - name: compute_count From c4a4847dfcfb9ceb36dc9c694f446d12d5691fe2 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 11 Nov 2024 16:49:26 +0000 Subject: [PATCH 75/90] reduced disk footprint of container pe-pulls --- .../files/podman_to_k3s.yml | 15 +++++++++++++ .../kube_prometheus_stack/tasks/install.yml | 21 ++----------------- 2 files changed, 17 insertions(+), 19 deletions(-) create mode 100644 ansible/roles/kube_prometheus_stack/files/podman_to_k3s.yml diff --git a/ansible/roles/kube_prometheus_stack/files/podman_to_k3s.yml 
b/ansible/roles/kube_prometheus_stack/files/podman_to_k3s.yml new file mode 100644 index 000000000..117091b01 --- /dev/null +++ b/ansible/roles/kube_prometheus_stack/files/podman_to_k3s.yml @@ -0,0 +1,15 @@ +- name: "Pull {{ item.name }} image with podman" + containers.podman.podman_image: + name: "{{ item.name }}" + tag: "{{ item.tag }}" + +- name: "Export {{ item.name }} image to k3s" + containers.podman.podman_save: + image: "{{ item.name }}:{{ item.tag }}" + dest: "/var/lib/rancher/k3s/agent/images/{{ item.name | regex_replace('\\/|\\.','-')}}.tar" + +- name: "Clean up {{ item.name }} podman image" + containers.podman.podman_image: + state: absent + name: "{{ item.name }}" + tag: "{{ item.tag }}" diff --git a/ansible/roles/kube_prometheus_stack/tasks/install.yml b/ansible/roles/kube_prometheus_stack/tasks/install.yml index e0ec9b821..e953c2f55 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/install.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/install.yml @@ -1,22 +1,5 @@ --- - name: Pre-pull kube-prometheus-stack images and import to k3s - block: - - name: Pull with images with podman - containers.podman.podman_image: - name: "{{ item.name }}" - tag: "{{ item.tag }}" - loop: "{{ image_list }}" - - - name: Export images to k3s - containers.podman.podman_save: - image: "{{ item.name }}:{{ item.tag }}" - dest: "/var/lib/rancher/k3s/agent/images/{{ item.name | regex_replace('\\/|\\.','-')}}.tar" - loop: "{{ image_list }}" - - - name: Clean up podman images - containers.podman.podman_image: - state: absent - name: "{{ item.name }}" - tag: "{{ item.tag }}" - loop: "{{ image_list }}" \ No newline at end of file + ansible.builtin.include_tasks: roles/kube_prometheus_stack/files/podman_to_k3s.yml + loop: "{{ image_list }}" From a2540f2e3ec8fe67ee96da3935e559b4eff1cca7 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 12 Nov 2024 14:00:01 +0000 Subject: [PATCH 76/90] moved image pulls to tasks --- ansible/roles/kube_prometheus_stack/tasks/install.yml | 
2 +- .../kube_prometheus_stack/{files => tasks}/podman_to_k3s.yml | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename ansible/roles/kube_prometheus_stack/{files => tasks}/podman_to_k3s.yml (100%) diff --git a/ansible/roles/kube_prometheus_stack/tasks/install.yml b/ansible/roles/kube_prometheus_stack/tasks/install.yml index e953c2f55..ac8056919 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/install.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/install.yml @@ -1,5 +1,5 @@ --- - name: Pre-pull kube-prometheus-stack images and import to k3s - ansible.builtin.include_tasks: roles/kube_prometheus_stack/files/podman_to_k3s.yml + ansible.builtin.include_tasks: podman_to_k3s.yml loop: "{{ image_list }}" diff --git a/ansible/roles/kube_prometheus_stack/files/podman_to_k3s.yml b/ansible/roles/kube_prometheus_stack/tasks/podman_to_k3s.yml similarity index 100% rename from ansible/roles/kube_prometheus_stack/files/podman_to_k3s.yml rename to ansible/roles/kube_prometheus_stack/tasks/podman_to_k3s.yml From cd281f3cfc6f1d8695ebf393220208bacce19f38 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 12 Nov 2024 14:49:14 +0000 Subject: [PATCH 77/90] moved prometheus install to host group --- ansible/fatimage.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index e89198fba..62ce9ba03 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -123,6 +123,10 @@ slurm_exporter_state: stopped when: "'slurm_exporter' in group_names" +- hosts: prometheus + become: yes + gather_facts: yes + tasks: - name: kube prometheus stack import_role: name: kube_prometheus_stack From 774f60827bb97d16e507b798d651f76fde8bd12d Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 12 Nov 2024 16:11:47 +0000 Subject: [PATCH 78/90] Review docs suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/monitoring-and-logging.md | 6 +++--- 
.../common/inventory/group_vars/all/alertmanager.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index c1e00f5a1..5e66121d1 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -3,7 +3,7 @@ ## Components overview ### [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) -An umbrella Helm chart which the appliance uses to deploy and manages containerised versions of Prometheus, Grafana, Alertmanager and Node Exporter. +An umbrella Helm chart which the appliance uses to deploy and manage containerised versions of Prometheus, Grafana, Alertmanager and Node Exporter. ### [filebeat](https://www.elastic.co/beats/filebeat) @@ -109,7 +109,7 @@ Note that if Open OnDemand is enabled, Grafana is only accessible through OOD's ### grafana dashboards -In addition to the default set of dashboards that are deployed by kube-prometheus-stack, the appliance ships with a default set of dashboards (listed below). The set of appliance-specific dashboards can be configured via the `grafana_dashboards` variable. The dashboards are either internal to the [grafana-dashboards role](../ansible/roles/grafana-dashboards/files/) or downloaded from grafana.com. If you wish to selectively remove the default dashboards deployed by kube-prometheus-stack, this can be done by overriding the `grafana_exclude_default_dashboards` variable in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). +In addition to the default set of dashboards that are deployed by kube-prometheus-stack, the appliance ships with additional dashboards listed below. The set of appliance-specific dashboards can be configured via the `grafana_dashboards` variable. 
The dashboards are either internal to the [grafana-dashboards role](../ansible/roles/grafana-dashboards/files/) or downloaded from grafana.com. If you wish to selectively remove the default dashboards deployed by kube-prometheus-stack, this can be done by overriding the `grafana_exclude_default_dashboards` variable in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). #### node exporter slurm @@ -230,7 +230,7 @@ Note that this service is not password protected, allowing anyone with access to ### Upgrades -The appliance previously used [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role to configure Prometheus, but our monitoring stack has since been moved into the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) Helm chart running on a k3s cluster. The some of the default Grafana dashboards deployed by kube-prometheus-stack are hardcoded to rely on the `job` label of metrics scraped from Node Exporter to have the value `node-exporter`. By default, the cloudalchemy role scraped these metrics with the `job` label set to `node`. Therefore, if upgrading from previous versions of the appliance which used the cloudalchemy role, pre-upgrade data will not show up by default in Grafana dashboards. The old data can still be viewed in the OpenHPC and Node Exporter Slurm dashboards by selecting the previous `job` value from the Job dropdown. +The appliance previously used [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role to configure Prometheus, but our monitoring stack has since been moved into the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack) Helm chart running on a k3s cluster. 
Some of the default Grafana dashboards deployed by kube-prometheus-stack are hardcoded to rely on the `job` label of metrics scraped from Node Exporter to have the value `node-exporter`. By default, the cloudalchemy role scraped these metrics with the `job` label set to `node`. Therefore, if upgrading from previous versions of the appliance which used the cloudalchemy role, pre-upgrade data will not show up by default in Grafana dashboards. The old data can still be viewed in the OpenHPC and Node Exporter Slurm dashboards by selecting the previous `job` value from the Job dropdown. ### Alerting and recording rules diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index 6d95ae233..fd91ce094 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -1,4 +1,4 @@ -alertmanager_image_tag: v0.27.0 +alertmanager_image_tag: 'v0.27.0' alertmanager_config: route: From 6506e7e893e417edab6d3efa07264e0b904ff2f8 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 12 Nov 2024 16:21:06 +0000 Subject: [PATCH 79/90] added readme link --- docs/monitoring-and-logging.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 5e66121d1..6970405ac 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -42,7 +42,7 @@ Where `role_name` is the name of the internal role. ## Customising variables -You should only customise the variables in `environments/common` if you are working on a feature that you intend to contribute back. Instead you should override the variables in the environment relevant to your deployment. This is possible since inventories later in the inheritance chain have greater precedence. Please see [README.md](../README.md#environments) for a more detailed explanation. 
This notice exists to avoid the need to need to keep repeating this point in the following sections. Where it is noted that you should customise a variable, it is implied that this change should be made to your own environment e.g `environments/production` in preference to `environments/common`, even when +You should only customise the variables in `environments/common` if you are working on a feature that you intend to contribute back. Instead you should override the variables in the environment relevant to your deployment. This is possible since inventories later in the inheritance chain have greater precedence. Please see [README.md](../environments/README.md) for a more detailed explanation. This notice exists to avoid the need to keep repeating this point in the following sections. Where it is noted that you should customise a variable, it is implied that this change should be made to your own environment e.g `environments/production` in preference to `environments/common`, even when this is not explicitly stated.
## filebeat From a6d8edcb078352ad2d0b2d49f5f7f5af28e760d9 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 13 Nov 2024 09:41:26 +0000 Subject: [PATCH 80/90] file name and defaults changes --- .../all/{monitoring.yml => kube_prometheus_stack.yml} | 0 environments/common/inventory/group_vars/all/prometheus.yml | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename environments/common/inventory/group_vars/all/{monitoring.yml => kube_prometheus_stack.yml} (100%) diff --git a/environments/common/inventory/group_vars/all/monitoring.yml b/environments/common/inventory/group_vars/all/kube_prometheus_stack.yml similarity index 100% rename from environments/common/inventory/group_vars/all/monitoring.yml rename to environments/common/inventory/group_vars/all/kube_prometheus_stack.yml diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 4bde55b3d..52b0a40e9 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -2,8 +2,8 @@ prometheus_port: 30000 prometheus_db_dir: "{{ appliances_state_dir }}/prometheus" -prometheus_storage_retention: "30d" -prometheus_storage_retention_size: "40GB" +prometheus_storage_retention: "31d" +prometheus_storage_retention_size: "100GB" prometheus_scrape_configs_default: - job_name: "slurm_exporter" scrape_interval: 30s From 5864b562083743d90cc436e51ef0a50ac233a5b5 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 13 Nov 2024 10:10:27 +0000 Subject: [PATCH 81/90] disambiguated default addresses --- environments/common/files/filebeat/filebeat.yml | 2 +- environments/common/inventory/group_vars/all/alertmanager.yml | 2 +- environments/common/inventory/group_vars/all/defaults.yml | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/environments/common/files/filebeat/filebeat.yml b/environments/common/files/filebeat/filebeat.yml index 
0f7186b3a..fbf61e514 100644 --- a/environments/common/files/filebeat/filebeat.yml +++ b/environments/common/files/filebeat/filebeat.yml @@ -54,7 +54,7 @@ processors: - {from: "json.ElapsedRaw", type: "integer"} output.elasticsearch: - hosts: ["{{ opensearch_address }}:9200"] + hosts: ["{{ '127.0.0.1' if groups['opensearch'] == groups['filebeat'] else opensearch_ip }}:9200"] protocol: "https" ssl.verification_mode: none username: "admin" diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index fd91ce094..ad8c428ca 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -20,7 +20,7 @@ alertmanager_slack_receiver: authorization: credentials: "{{ alertmanager_slack_integration.app_creds | default('none') }}" text: "{{ '{{' }} .GroupLabels.alertname {{ '}}' }} : {{ '{{' }} .CommonAnnotations.description {{ '}}' }}" - title_link: "http://{{ prometheus_address }}/alertmanager/#/alerts?receiver=slack-receiver" + title_link: "http://{{ prometheus_address }}:{{ prometheus_port }}/alertmanager/#/alerts?receiver=slack-receiver" send_resolved: true # Uncomment below and add Slack bot app creds for Slack integration diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 03df64dd2..53c3be124 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -15,8 +15,7 @@ internal_address: "{{ inventory_hostname }}" api_address: "{{ inventory_hostname }}" # Service endpoints -opensearch_address: "127.0.0.1" -prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}:{{ prometheus_port }}" +prometheus_address: "{{ hostvars[groups['prometheus'].0].api_address }}" openondemand_address: "{{ hostvars[groups['openondemand'].0].api_address if 
groups['openondemand'] | count > 0 else '' }}" grafana_address: "{{ hostvars[groups['prometheus'].0].api_address }}" From 15b77db44f4ecd428a9709475a808ce8c8bcfef3 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 13 Nov 2024 10:55:50 +0000 Subject: [PATCH 82/90] separated prometheus recording and alerting rules --- .../defaults/main/main.yml | 9 ++++++--- docs/monitoring-and-logging.md | 2 +- .../inventory/group_vars/all/prometheus.yml | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml index bf6097089..72c627179 100644 --- a/ansible/roles/kube_prometheus_stack/defaults/main/main.yml +++ b/ansible/roles/kube_prometheus_stack/defaults/main/main.yml @@ -63,13 +63,16 @@ prometheus_external_labels: prometheus_scrape_configs: [] -prometheus_extra_rules: [] +prometheus_extra_recording_rules: [] +prometheus_extra_alerting_rules: [] prometheus_rules: appliance-rules: groups: - - name: all - rules: "{{ prometheus_extra_rules }}" + - name: appliance-recording-rules + rules: "{{ prometheus_extra_recording_rules }}" + - name: appliance-alerting-rules + rules: "{{ prometheus_extra_alerting_rules }}" # ------------------------------------------------------------------------------------------ grafana_image_tag: 11.2.2 diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 6970405ac..e46969da0 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -236,7 +236,7 @@ The appliance previously used [cloudalchemy.prometheus](https://github.com/cloud See the upstream documentation for [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules. 
-In addition to the default recording and alerting rules set by kube-prometheus-stack, the appliances provides a default set of rules which can be found in the `prometheus_extra_rules` list in: +In addition to the default recording and alerting rules set by kube-prometheus-stack, the appliance provides its own sets of default rules which can be found and modified in the `prometheus_extra_recording_rules` and `prometheus_extra_alerting_rules` lists in: > [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index 52b0a40e9..351aedbe7 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -20,14 +20,7 @@ prometheus_scrape_configs_default: replacement: '${1}' prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" -prometheus_extra_rules: - - alert: SlurmNodeDown - annotations: - description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}' - summary: 'At least one Slurm node is down.' 
- expr: "slurm_nodes_down > 0\n" - labels: - severity: critical +prometheus_extra_recording_rules: - record: node_cpu_system_seconds:record expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s]))) - record: node_cpu_user_seconds:record @@ -42,3 +35,12 @@ prometheus_extra_rules: expr: min by (instance) (node_cpu_scaling_frequency_hertz) - record: node_cpu_scaling_frequency_hertz_max:record expr: max by (instance) (node_cpu_scaling_frequency_hertz) + +prometheus_extra_alerting_rules: + - alert: SlurmNodeDown + annotations: + description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}' + summary: 'At least one Slurm node is down.' + expr: "slurm_nodes_down > 0\n" + labels: + severity: critical From acf0c0d7c81239c573b75c83fe3b7370b417ce69 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Wed, 13 Nov 2024 11:43:01 +0000 Subject: [PATCH 83/90] adding alertmanager docs --- docs/alerting.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/alerting.md diff --git a/docs/alerting.md b/docs/alerting.md new file mode 100644 index 000000000..fff516298 --- /dev/null +++ b/docs/alerting.md @@ -0,0 +1,8 @@ +# Receiving alert notifications + +The appliance uses [Alertmanager](https://prometheus.io/docs/alerting/latest/alertmanager/) to route Prometheus alerts to site receivers. See the [monitoring docs](monitoring-and-logging.md#alerting-and-recording-rules) for configuring custom Prometheus alerts. 
+ +A default Slack alert receiver configuration is provided in +> [environments/common/inventory/group_vars/all/alertmanager.yml](../environments/common/inventory/group_vars/all/alertmanager.yml) + +which can be enabled by uncommenting the `alertmanager_slack_integration` variable and providing application credentials for a Slack app with `chat:write` permissions for the channel you wish to receive alerts in (see [Slack app](https://api.slack.com/quickstart) docs for configuration). From 8ca040733944dd622939672263b8464bf582bb56 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:53:20 +0000 Subject: [PATCH 84/90] bump --- .../.stackhpc/terraform/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json index f9a2087c8..1bf44d240 100644 --- a/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/terraform/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ {
"cluster_image": { - "RL8": "openhpc-RL8-241119-0915-6f164927", - "RL9": "openhpc-RL9-241119-0914-6f164927" + "RL8": "openhpc-RL8-241120-1147-7f0af9e8", + "RL9": "openhpc-RL9-241120-1147-7f0af9e8" } } From aae2aa4bfcfeaa36a8f60982db805037a0b843ee Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 20 Jan 2025 10:47:54 +0000 Subject: [PATCH 86/90] pinned python kube version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 75346c5d1..4965e1adf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,5 +9,5 @@ cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 netaddr matplotlib -kubernetes +kubernetes==31.0.0 pulp-cli==0.23.2 From fe79a338a51e49c044d5d951d6ead8dbb2916d6c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 20 Jan 2025 11:03:41 +0000 Subject: [PATCH 87/90] removed old monitoring services from systemd dropins --- environments/common/inventory/group_vars/all/systemd.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/environments/common/inventory/group_vars/all/systemd.yml b/environments/common/inventory/group_vars/all/systemd.yml index 2c5e03e35..53efbda60 100644 --- a/environments/common/inventory/group_vars/all/systemd.yml +++ b/environments/common/inventory/group_vars/all/systemd.yml @@ -7,17 +7,11 @@ _systemd_dropins_statedir: opensearch: group: opensearch content: "{{ _systemd_requiresmount_statedir }}" - grafana-server: - group: grafana - content: "{{ _systemd_requiresmount_statedir }}" slurmdbd: group: openhpc content: "{{ _systemd_requiresmount_statedir }}" slurmctld: group: openhpc content: "{{ _systemd_requiresmount_statedir }}" - prometheus: - group: prometheus - content: "{{ _systemd_requiresmount_statedir }}" systemd_dropins: "{{ _systemd_dropins_statedir if appliances_state_dir is defined else {} }}" From 32735c0ec15c9175f003bb552921be5b5c64dc33 Mon Sep 17 
00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Mon, 20 Jan 2025 11:50:38 +0000 Subject: [PATCH 88/90] bump images --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index de5e8e194..81b57bb40 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250115-1510-99f67c6d", - "RL9": "openhpc-RL9-250115-1510-99f67c6d" + "RL8": "openhpc-RL8-250120-1106-fe79a338", + "RL9": "openhpc-RL9-250120-1106-fe79a338" } } From 63c309427a7c750580edb40a05ee4130a7cc3097 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 20 Jan 2025 17:13:14 +0000 Subject: [PATCH 89/90] fixed KPS not having access to legacy data --- ansible/roles/kube_prometheus_stack/tasks/main.yml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ansible/roles/kube_prometheus_stack/tasks/main.yml b/ansible/roles/kube_prometheus_stack/tasks/main.yml index 13488de58..71ccfd0fb 100644 --- a/ansible/roles/kube_prometheus_stack/tasks/main.yml +++ b/ansible/roles/kube_prometheus_stack/tasks/main.yml @@ -30,6 +30,13 @@ cmd: "mv {{ item.path }} {{ prometheus_db_dir }}/prometheus-db/{{ item.path | regex_search('([^/]+$)') }}" loop: "{{ prometheus_files.files }}" + - name: Set ownership + ansible.builtin.file: + path: "{{ prometheus_db_dir }}" + owner: root + group: root + recurse: true + - name: Creating namespace kubernetes.core.k8s: name: "{{ kube_prometheus_stack_release_namespace }}" From bdd265aece40672d69ee343d181ffecdd1726fde Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 21 Jan 2025 08:29:10 +0000 Subject: [PATCH 90/90] fixed prometheus not resolving OOD --- environments/common/inventory/group_vars/all/openondemand.yml | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index d252f9024..d954b60e6 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -183,7 +183,7 @@ openondemand_scrape_configs: scrape_interval: 2m static_configs: - targets: - - "{{ openondemand_address }}:9301" + - "{{ openondemand_ip }}:9301" labels: environment: "{{ appliances_environment_name }}" service: "openondemand"