diff --git a/Templates/AWS-HPC-Cluster.yaml b/Templates/AWS-HPC-Cluster.yaml
index a2bc479..73f4441 100644
--- a/Templates/AWS-HPC-Cluster.yaml
+++ b/Templates/AWS-HPC-Cluster.yaml
@@ -957,8 +957,8 @@ Outputs:
Cloud9URL:
Description: Cloud9 Environment
Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/cloud9/ide/${Cloud9}'
- EnginFrameURL:
+ WebURL:
Description: "EnginFrame HPC Portal, default username: ec2-user , default password: Change_this!"
Value: !Sub
- - 'https://${ALB}/enginframe'
+ - 'https://${ALB}/'
- ALB: !GetAtt ApplicationLoadBalancer.DNSName
\ No newline at end of file
diff --git a/modules/40.install.monitoring.compute.sh b/modules/40.install.monitoring.compute.sh
new file mode 100644
index 0000000..e7637b5
--- /dev/null
+++ b/modules/40.install.monitoring.compute.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this
+# software and associated documentation files (the "Software"), to deal in the Software
+# without restriction, including without limitation the rights to use, copy, modify,
+# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+source /etc/parallelcluster/cfnconfig
+compute_instance_type=$(ec2-metadata -t | awk '{print $2}')
+gpu_instances="[pg][2-9].*\.[0-9]*[x]*large"
+
+monitoring_dir_name="monitoring"
+monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"
+
+set -x
+set -e
+
+installPreReq() {
+    # Docker (started now and enabled at boot) plus golang-bin; the cluster user joins the docker group.
+    yum -y install docker golang-bin
+    service docker start
+    chkconfig docker on
+    usermod -a -G docker "$cfn_cluster_user"
+    # docker-compose is fetched with curl until "yum -y install docker-compose" works again; -f makes curl fail on an HTTP error instead of saving the error page as the binary.
+    curl -fL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+    chmod +x /usr/local/bin/docker-compose
+}
+
+configureMonitoring() {
+    # Start the compute monitoring containers; GPU instance types use the GPU compose file.
+    local compose_name="docker-compose.compute.yml"
+    if [[ $compute_instance_type =~ $gpu_instances ]]; then
+        # nvidia-docker2 is installed from NVIDIA's repo file for this distribution, then docker is restarted.
+        distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
+        curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo
+        yum -y clean expire-cache
+        yum -y install nvidia-docker2
+        systemctl restart docker
+        compose_name="docker-compose.compute.gpu.yml"
+    fi
+    /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/${compose_name}" -p monitoring-compute up -d
+}
+
+# main
+# ----------------------------------------------------------------------------
+main() {
+ echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: START" >&2
+ installPreReq
+ configureMonitoring
+ echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: STOP" >&2
+}
+
+main "$@"
\ No newline at end of file
diff --git a/modules/40.install.monitoring.master.sh b/modules/40.install.monitoring.master.sh
new file mode 100644
index 0000000..6036ec4
--- /dev/null
+++ b/modules/40.install.monitoring.master.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this
+# software and associated documentation files (the "Software"), to deal in the Software
+# without restriction, including without limitation the rights to use, copy, modify,
+# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+# permit persons to whom the Software is furnished to do so.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+source /etc/parallelcluster/cfnconfig
+chef_dna="/etc/chef/dna.json"
+cfn_fsx_fs_id=$(grep \"cfn_fsx_fs_id\" "${chef_dna}" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
+master_instance_id=$(ec2-metadata -i | awk '{print $2}')
+cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name "$stack_name" --region "$cfn_region" | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue')
+monitoring_dir_name="monitoring"
+monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"
+s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
+grafana_password=$(aws secretsmanager get-secret-value --secret-id "${stack_name}" --query SecretString --output text --region "${cfn_region}")
+# dna_json is not defined in this script: it is used when the caller exports it, otherwise
+# the Chef attributes file referenced by chef_dna is read instead of an empty file name.
+NICE_ROOT=$(jq --arg default "${SHARED_FS_DIR}/nice" -r '.post_install.enginframe | if has("nice_root") then .nice_root else $default end' "${dna_json:-${chef_dna}}")
+set -x
+set -e
+
+installPreReq() {
+    # Docker (started now and enabled at boot) plus golang-bin; the cluster user joins the docker group.
+    yum -y install docker golang-bin
+    service docker start
+    chkconfig docker on
+    usermod -a -G docker "$cfn_cluster_user"
+    # docker-compose is fetched with curl until "yum -y install docker-compose" works again; -f makes curl fail on an HTTP error instead of saving the error page as the binary.
+    curl -fL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+    chmod +x /usr/local/bin/docker-compose
+}
+
+saveClusterConfigLocally(){
+    # Bucket, key and version of the cluster configuration object, read from the Chef attributes file.
+    cluster_s3_bucket=$(grep \"cluster_s3_bucket\" "${chef_dna}" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
+    cluster_config_s3_key=$(grep \"cluster_config_s3_key\" "${chef_dna}" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
+    cluster_config_version=$(grep \"cluster_config_version\" "${chef_dna}" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
+    # Left global on purpose: configureMonitoring substitutes it into logs.json (slashes are pre-escaped for sed).
+    log_group_names="\/aws\/parallelcluster\/$(echo "${stack_name}" | cut -d "-" -f2-)"
+    mkdir -p "${monitoring_home}/parallelcluster"
+    aws s3api get-object --bucket "$cluster_s3_bucket" --key "$cluster_config_s3_key" --region "$cfn_region" --version-id "$cluster_config_version" "${monitoring_home}/parallelcluster/cluster-config.json"
+}
+
+installMonitoring(){
+    # Copy the monitoring bundle from the post-install location into the shared directory; abort if the copy fails.
+    aws s3 cp --recursive "${post_install_base}/monitoring" "${monitoring_home}" --region "${cfn_region}" || exit 1
+    chown "$cfn_cluster_user:$cfn_cluster_user" -R "${monitoring_home}"
+    chmod +x "${monitoring_home}"/custom-metrics/*
+    # Metric scripts go to /usr/local/bin; the slurm_exporter unit file is moved into the systemd directory.
+    cp -rp "${monitoring_home}"/custom-metrics/* /usr/local/bin/
+    mv -f "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/
+    # Static web content is copied into the ROOT webapp below NICE_ROOT.
+    cp -rp "${monitoring_home}"/www/* "${NICE_ROOT}/enginframe/conf/tomcat/webapps/ROOT/"
+}
+
+
+
+configureMonitoring() {
+    # Schedule the cost-metric scripts for the cluster user. "|| true" keeps the subshell going under
+    # "set -e" when the user has no crontab yet, so the new entry is still written.
+    (crontab -l -u "$cfn_cluster_user" || true; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u "$cfn_cluster_user" -
+    (crontab -l -u "$cfn_cluster_user" || true; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u "$cfn_cluster_user" -
+
+    # replace tokens
+    sed -i "s/_S3_BUCKET_/${s3_bucket}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
+    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
+    sed -i "s/__FSX_ID__/${cfn_fsx_fs_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
+    sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
+    sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/logs.json"
+    sed -i "s/__LOG_GROUP__NAMES__/${log_group_names}/g" "${monitoring_home}/grafana/dashboards/logs.json"
+    sed -i "s/__Application__/${stack_name}/g" "${monitoring_home}/prometheus/prometheus.yml"
+    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/master-node-details.json"
+    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-list.json"
+    sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-details.json"
+    sed -i "s~__MONITORING_DIR__~${monitoring_home}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml"
+
+    # The secret must not reach the "set -x" trace: tracing is paused around this one substitution.
+    { set +x; } 2>/dev/null
+    sed -i "s~__GRAFANA_PASSWORD__~${grafana_password}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml"
+    set -x
+
+    # Download and build prometheus-slurm-exporter
+    ##### Please note this software package is under GPLv3 License #####
+    # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
+    cd "${monitoring_home}"
+    #FIXME: temporary
+    rm -rf prometheus-slurm-exporter
+    git clone https://github.com/vpenso/prometheus-slurm-exporter.git
+    cd prometheus-slurm-exporter
+    sed -i 's/NodeList,AllocMem,Memory,CPUsState,StateLong/NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:/' node.go
+    GOPATH=/root/go-modules-cache HOME=/root go mod download
+    GOPATH=/root/go-modules-cache HOME=/root go build
+    mv -f "${monitoring_home}/prometheus-slurm-exporter/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter
+}
+
+
+startMonitoringDaemons() {
+ # Start the master compose project detached (cfnconfig is passed as the compose env file), then reload systemd and enable/start slurm_exporter.
+ /usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f "${monitoring_home}/docker-compose/docker-compose.master.yml" -p monitoring-master up -d
+ systemctl daemon-reload
+ systemctl enable slurm_exporter
+ systemctl start slurm_exporter
+
+}
+
+# main
+# ----------------------------------------------------------------------------
+main() {
+ echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.master.sh: START" >&2
+ installPreReq
+ saveClusterConfigLocally
+ installMonitoring
+ configureMonitoring
+ startMonitoringDaemons
+ echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.master.sh: STOP" >&2
+}
+
+main "$@"
\ No newline at end of file
diff --git a/monitoring/custom-metrics/1h-cost-metrics.sh b/monitoring/custom-metrics/1h-cost-metrics.sh
new file mode 100644
index 0000000..f4d3cb0
--- /dev/null
+++ b/monitoring/custom-metrics/1h-cost-metrics.sh
@@ -0,0 +1,123 @@
+#!/bin/bash
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+
+#source the AWS ParallelCluster profile
+. /etc/parallelcluster/cfnconfig
+
+export AWS_DEFAULT_REGION=$cfn_region
+aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region)
+aws_region_long_name=${aws_region_long_name/Europe/EU}
+
+masterInstanceType=$(ec2-metadata -t | awk '{print $2}')
+masterInstanceId=$(ec2-metadata -i | awk '{print $2}')
+s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
+s3_size_gb=$(echo "$(aws s3api list-objects --bucket $s3_bucket --output json --query "[sum(Contents[].Size)]"| sed -n 2p | tr -d ' ') / 1024 / 1024 / 1024" | bc)
+
+
+#retrieve the s3 cost
+if [[ $s3_size_gb -le 51200 ]]; then
+ s3_range=51200
+elif [[ $VAR -le 512000 ]]; then
+ s3_range=512000
+else
+ s3_range="Inf"
+fi
+
+####################### S3 #########################
+
+s3_cost_gb_month=$(aws --region us-east-1 pricing get-products \
+ --service-code AmazonS3 \
+ --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+ 'Type=TERM_MATCH,Field=storageClass,Value=General Purpose' \
+ --query 'PriceList[0]' --output text \
+ | jq -r --arg endRange $s3_range '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[].value | select(.endRange==$endRange).pricePerUnit.USD')
+
+s3=$(echo "scale=2; $s3_cost_gb_month * $s3_size_gb / 720" | bc)
+echo "s3_cost $s3" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
+
+
+####################### Master #########################
+master_node_h_price=$(aws pricing get-products \
+ --region us-east-1 \
+ --service-code AmazonEC2 \
+ --filters 'Type=TERM_MATCH,Field=instanceType,Value='$masterInstanceType \
+ 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+ 'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \
+ 'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \
+ 'Type=TERM_MATCH,Field=tenancy,Value=Shared' \
+ 'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \
+ --output text \
+ --query 'PriceList' \
+ | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
+
+echo "master_node_cost $master_node_h_price" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
+
+# FSx for Lustre: the file system id is the 2nd comma-separated field of the FSXOptions stack parameter.
+fsx_id=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region \
+ | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "FSXOptions"))[0].ParameterValue' \
+ | awk -F "," '{print $2}')
+fsx_summary=$(aws fsx describe-file-systems --region $cfn_region --file-system-ids $fsx_id)
+fsx_size_gb=$(echo $fsx_summary | jq -r '.FileSystems[0].StorageCapacity')
+fsx_type=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.DeploymentType')
+fsx_throughput=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.PerUnitStorageThroughput')
+
+if [[ $fsx_type = "SCRATCH_2" ]] || [[ $fsx_type = "SCRATCH_1" ]]; then
+ fsx_cost_gb_month=$(aws pricing get-products \
+ --region us-east-1 \
+ --service-code AmazonFSx \
+ --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+ 'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
+ 'Type=TERM_MATCH,Field=throughputCapacity,Value=N/A' \
+ --output text \
+ --query 'PriceList' \
+ | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
+# PERSISTENT_1 is looked up with the file system's PerUnitStorageThroughput as throughputCapacity.
+elif [ $fsx_type = "PERSISTENT_1" ]; then
+ fsx_cost_gb_month=$(aws pricing get-products \
+ --region us-east-1 \
+ --service-code AmazonFSx \
+ --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+ 'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
+ 'Type=TERM_MATCH,Field=throughputCapacity,Value='$fsx_throughput \
+ --output text \
+ --query 'PriceList' \
+ | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
+# Any other deployment type is reported with a zero price.
+else
+ fsx_cost_gb_month=0
+fi
+# Price * StorageCapacity / 720 is pushed as fsx_cost.
+fsx=$(echo "scale=2; $fsx_cost_gb_month * $fsx_size_gb / 720" | bc)
+echo "fsx_cost $fsx" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
+
+
+#parametrize:
+ebs_volume_total_cost=0
+ebs_volume_ids=$(aws ec2 describe-instances --instance-ids $masterInstanceId \
+ | jq -r '.Reservations | to_entries[].value | .Instances | to_entries[].value | .BlockDeviceMappings | to_entries[].value | .Ebs.VolumeId')
+
+for ebs_volume_id in $ebs_volume_ids
+do
+ ebs_volume_type=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.VolumeType')
+ #ebs_volume_iops=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Iops')
+ ebs_volume_size=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Size')
+
+ ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \
+ --service-code AmazonEC2 \
+ --query 'PriceList' \
+ --output text \
+ --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+ 'Type=TERM_MATCH,Field=productFamily,Value=Storage' \
+ 'Type=TERM_MATCH,Field=volumeApiName,Value='$ebs_volume_type \
+ | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
+
+ ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $ebs_volume_size / 720" | bc)
+ ebs_volume_total_cost=$(echo "scale=2; $ebs_volume_total_cost + $ebs_volume_cost" | bc)
+done
+
+echo "ebs_master_cost $ebs_volume_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
\ No newline at end of file
diff --git a/monitoring/custom-metrics/1m-cost-metrics.sh b/monitoring/custom-metrics/1m-cost-metrics.sh
new file mode 100644
index 0000000..9a8ee8a
--- /dev/null
+++ b/monitoring/custom-metrics/1m-cost-metrics.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+
+#!/bin/bash
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+
+#source the AWS ParallelCluster profile
+. /etc/parallelcluster/cfnconfig
+
+export AWS_DEFAULT_REGION=$cfn_region
+# Region description printed by aws-region.py, with "Europe" replaced by "EU"; used as the pricing location filter.
+aws_region_long_name=$(python /usr/local/bin/aws-region.py "$cfn_region")
+aws_region_long_name=${aws_region_long_name/Europe/EU}
+#FIXME: not hardcode dir
+monitoring_dir_name="monitoring"
+monitoring_home="/fsx/${monitoring_dir_name}"
+queues=$(/opt/slurm/bin/sinfo --noheader -O partition | sed 's/\*//g')
+# saveClusterConfigLocally in 40.install.monitoring.master.sh stores the file under parallelcluster/.
+cluster_config_file="${monitoring_home}/parallelcluster/cluster-config.json"
+
+# Cluster-wide totals, accumulated over every Slurm partition returned by sinfo.
+compute_nodes_total_cost=0
+compute_ebs_volume_cost=0
+
+# The gp2 price and the ComputeRootVolumeSize parameter do not depend on the queue: look them up once.
+ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \
+    --service-code AmazonEC2 \
+    --query 'PriceList' \
+    --output text \
+    --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+              'Type=TERM_MATCH,Field=productFamily,Value=Storage' \
+              'Type=TERM_MATCH,Field=volumeApiName,Value=gp2' \
+    | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
+ebs_volume_size=$(aws cloudformation describe-stacks --stack-name "$stack_name" --region "$cfn_region" | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "ComputeRootVolumeSize"))[0].ParameterValue')
+
+for queue in $queues; do
+    instance_type=$(jq -r --arg queue "$queue" '.cluster.queue_settings | to_entries[] | select(.key==$queue).value.compute_resource_settings | to_entries[]| .value.instance_type' "${cluster_config_file}")
+    compute_node_h_price=$(aws pricing get-products \
+        --region us-east-1 \
+        --service-code AmazonEC2 \
+        --filters 'Type=TERM_MATCH,Field=instanceType,Value='$instance_type \
+                  'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
+                  'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \
+                  'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \
+                  'Type=TERM_MATCH,Field=tenancy,Value=Shared' \
+                  'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \
+        --output text \
+        --query 'PriceList' \
+        | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')
+    # Sum of the 4th sinfo field over the partition's rows, skipping rows that contain "idle~".
+    total_num_compute_nodes=$(/opt/slurm/bin/sinfo --noheader --partition=$queue | egrep -v "idle~" | awk '{sum += $4} END {if (sum) print sum; else print 0; }')
+    queue_ebs_cost=$(echo "scale=2; $ebs_cost_gb_month * $total_num_compute_nodes * $ebs_volume_size / 720" | bc)
+    compute_nodes_cost=$(echo "scale=2; $total_num_compute_nodes * $compute_node_h_price" | bc)
+    compute_ebs_volume_cost=$(echo "scale=2; $compute_ebs_volume_cost + $queue_ebs_cost" | bc)
+    compute_nodes_total_cost=$(echo "scale=2; $compute_nodes_total_cost + $compute_nodes_cost" | bc)
+done
+
+# Both metrics now cover all queues; ebs_compute_cost is no longer just the last queue's value.
+echo "ebs_compute_cost $compute_ebs_volume_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
+echo "compute_nodes_cost $compute_nodes_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
\ No newline at end of file
diff --git a/monitoring/custom-metrics/aws-region.py b/monitoring/custom-metrics/aws-region.py
new file mode 100644
index 0000000..c5465f9
--- /dev/null
+++ b/monitoring/custom-metrics/aws-region.py
@@ -0,0 +1,23 @@
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+import json
+import sys
+
+from pkg_resources import resource_filename
+
+region = str(sys.argv[1])
+
+name = None
+endpoint_file = resource_filename('botocore', 'data/endpoints.json')
+with open(endpoint_file, 'r') as ep_file:
+ data = json.load(ep_file)
+ for partition in data['partitions']:
+ if region in partition['regions']:
+ name = partition['regions'][region]['description']
+ break
+
+print(name)
\ No newline at end of file
diff --git a/monitoring/docker-compose/docker-compose.compute.gpu.yml b/monitoring/docker-compose/docker-compose.compute.gpu.yml
new file mode 100644
index 0000000..e0778ac
--- /dev/null
+++ b/monitoring/docker-compose/docker-compose.compute.gpu.yml
@@ -0,0 +1,28 @@
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+version: '3.8'
+services:
+ prometheus-node-exporter:
+ container_name: node-exporter
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ volumes:
+ - '/:/host:ro,rslave'
+ image: quay.io/prometheus/node-exporter
+ command:
+ - '--path.rootfs=/host'
+ dcgm-exporter:
+ container_name: nvidia-dcgm
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ image: nvidia/dcgm-exporter
+ runtime: nvidia
+ environment:
+ - NVIDIA_VISIBLE_DEVICES=all
+ - NVIDIA_DRIVER_CAPABILITIES=all
\ No newline at end of file
diff --git a/monitoring/docker-compose/docker-compose.compute.yml b/monitoring/docker-compose/docker-compose.compute.yml
new file mode 100644
index 0000000..3f1dfd6
--- /dev/null
+++ b/monitoring/docker-compose/docker-compose.compute.yml
@@ -0,0 +1,18 @@
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+version: '3.8'
+services:
+ prometheus-node-exporter:
+ container_name: node-exporter
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ volumes:
+ - '/:/host:ro,rslave'
+ image: quay.io/prometheus/node-exporter
+ command:
+ - '--path.rootfs=/host'
\ No newline at end of file
diff --git a/monitoring/docker-compose/docker-compose.master.yml b/monitoring/docker-compose/docker-compose.master.yml
new file mode 100644
index 0000000..270f5b8
--- /dev/null
+++ b/monitoring/docker-compose/docker-compose.master.yml
@@ -0,0 +1,65 @@
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+version: '3.8'
+services:
+ pushgateway:
+ container_name: pushgateway
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ image: prom/pushgateway
+ prometheus:
+ container_name: prometheus
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ volumes:
+ - '__MONITORING_DIR__/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml'
+ - 'prometheus-data:/prometheus'
+ image: prom/prometheus
+ command:
+ - '--config.file=/etc/prometheus/prometheus.yml'
+ - '--storage.tsdb.path=/prometheus'
+ - '--web.console.libraries=/usr/share/prometheus/console_libraries'
+ - '--web.console.templates=/usr/share/prometheus/consoles'
+ - '--web.external-url=/prometheus/'
+ - '--web.route-prefix=/'
+ grafana:
+ container_name: grafana
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ environment:
+ - 'GF_SECURITY_ADMIN_PASSWORD=__GRAFANA_PASSWORD__'
+ - 'GF_SERVER_ROOT_URL=http://%(domain)s/grafana/'
+ volumes:
+ - '__MONITORING_DIR__/grafana:/etc/grafana/provisioning'
+ - 'grafana-data:/var/lib/grafana'
+ image: grafana/grafana
+ prometheus-node-exporter:
+ container_name: node-exporter
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ volumes:
+ - '/:/host:ro,rslave'
+ image: quay.io/prometheus/node-exporter
+ command:
+ - '--path.rootfs=/host'
+ nginx:
+ container_name: nginx
+ network_mode: host
+ pid: host
+ restart: unless-stopped
+ volumes:
+ - '__MONITORING_DIR__/nginx/conf.d:/etc/nginx/conf.d/'
+ - '__MONITORING_DIR__/nginx/ssl:/etc/ssl/'
+ - '__MONITORING_DIR__/www:/usr/share/nginx/html'
+ image: nginx
+volumes:
+ prometheus-data:
+ grafana-data:
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/ParallelCluster.json b/monitoring/grafana/dashboards/ParallelCluster.json
new file mode 100755
index 0000000..8d48680
--- /dev/null
+++ b/monitoring/grafana/dashboards/ParallelCluster.json
@@ -0,0 +1,2761 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "links": [
+ {
+ "asDropdown": false,
+ "icon": "external link",
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 45,
+ "panels": [],
+ "title": "Storage",
+ "type": "row"
+ },
+ {
+ "columns": [],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fontSize": "110%",
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 0,
+ "y": 1
+ },
+ "id": 50,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 6,
+ "desc": false
+ },
+ "styles": [
+ {
+ "alias": "Mounted on",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "mountpoint",
+ "thresholds": [
+ ""
+ ],
+ "type": "string",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Avail",
+ "align": "auto",
+ "colorMode": "value",
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "thresholds": [
+ "10000000000",
+ "20000000000"
+ ],
+ "type": "number",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Used",
+ "align": "auto",
+ "colorMode": "cell",
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [
+ "0.6",
+ "0.8"
+ ],
+ "type": "number",
+ "unit": "percentunit"
+ },
+ {
+ "alias": "Size",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 1,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [],
+ "type": "number",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Filesystem",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "fstype",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "IP",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "instance",
+ "preserveFormat": false,
+ "sanitize": false,
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "preserveFormat": true,
+ "sanitize": false,
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "node_filesystem_size_bytes{instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"}-0",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "C"
+ },
+ {
+ "expr": "node_filesystem_avail_bytes {instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"}-0",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "interval": "10s",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "1-(node_filesystem_free_bytes{instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"} / node_filesystem_size_bytes{instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"})",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "title": "Cluster Storage",
+ "transform": "table",
+ "type": "table-old"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "align": null
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 8,
+ "y": 1
+ },
+ "hiddenSeries": false,
+ "id": 52,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": true,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pluginVersion": "7.0.3",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "",
+ "dimensions": {"FileSystemId": "__FSX_ID__"},
+ "expression": "",
+ "id": "fsxRead",
+ "matchExact": false,
+ "metricName": "DataReadBytes",
+ "namespace": "AWS/FSx",
+ "period": "",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Sum"
+ ]
+ },
+ {
+ "alias": "",
+ "dimensions": {"FileSystemId": "__FSX_ID__"},
+ "expression": "",
+ "hide": false,
+ "id": "fsxWrite",
+ "matchExact": false,
+ "metricName": "DataWriteBytes",
+ "namespace": "AWS/FSx",
+ "period": "",
+ "refId": "B",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Sum"
+ ]
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "FSx Total throughput ",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "transformations": [
+ {
+ "id": "filterFieldsByName",
+ "options": {
+ "include": {
+ "names": [
+ "Time",
+ "DataReadBytes",
+ "DataWriteBytes"
+ ]
+ }
+ }
+ }
+ ],
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:9663",
+ "decimals": 1,
+ "format": "Bps",
+ "label": "Total throughput (bytes/sec)",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:9664",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "cloudwatch",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {
+ "align": null
+ },
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 9,
+ "w": 8,
+ "x": 16,
+ "y": 1
+ },
+ "hiddenSeries": false,
+ "id": 53,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "show": true,
+ "total": true,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pluginVersion": "7.0.3",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "",
+ "dimensions": {"FileSystemId": "__FSX_ID__"},
+ "expression": "",
+ "id": "fsxIOPsRead",
+ "matchExact": false,
+ "metricName": "DataReadOperations",
+ "namespace": "AWS/FSx",
+ "period": "",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Sum"
+ ]
+ },
+ {
+ "alias": "",
+ "dimensions": {"FileSystemId": "__FSX_ID__"},
+ "expression": "",
+ "hide": false,
+ "id": "fsxIOPsWrite",
+ "matchExact": false,
+ "metricName": "DataWriteOperations",
+ "namespace": "AWS/FSx",
+ "period": "",
+ "refId": "B",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Sum"
+ ]
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "FSx Total IOPs",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "transformations": [
+ {
+ "id": "filterFieldsByName",
+ "options": {
+ "include": {
+ "names": [
+ "Time",
+ "DataReadOperations",
+ "DataWriteOperations"
+ ]
+ }
+ }
+ }
+ ],
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:9663",
+ "decimals": 1,
+ "format": "iops",
+ "label": "Total IOPs (Operations/sec)",
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:9664",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 10
+ },
+ "id": 40,
+ "panels": [],
+ "title": "Scheduler Stats",
+ "type": "row"
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 0,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 6,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 3.01
+ },
+ {
+ "color": "red",
+ "value": 5.01
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 0,
+ "y": 11
+ },
+ "id": 48,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "slurm_queue_configuring",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Slurm Configuring Jobs",
+ "type": "gauge"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": 0,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 9,
+ "x": 4,
+ "y": 11
+ },
+ "hiddenSeries": false,
+ "id": 59,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "slurm_queue_completing",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Completing Jobs",
+ "refId": "A"
+ },
+ {
+ "expr": "slurm_queue_running",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Running Jobs",
+ "refId": "B"
+ },
+ {
+ "expr": "slurm_queue_pending",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Pending Jobs",
+ "refId": "C"
+ },
+ {
+ "expr": "slurm_queue_completed",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Completed Jobs",
+ "refId": "D"
+ },
+ {
+ "expr": "slurm_queue_timeout",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Timed out Jobs",
+ "refId": "E"
+ },
+ {
+ "expr": "slurm_queue_failed",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Failed Jobs",
+ "refId": "F"
+ },
+ {
+ "expr": "slurm_queue_node_fail",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Failed jobs (due to NodeFail)",
+ "refId": "G"
+ },
+ {
+ "expr": "slurm_queue_suspended",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Suspended Jobs",
+ "refId": "H"
+ },
+ {
+ "expr": "slurm_queue_cancelled",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Cancelled Jobs",
+ "refId": "I"
+ },
+ {
+ "expr": "slurm_queue_preempted",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Preempted Jobs",
+ "refId": "J"
+ },
+ {
+ "expr": "slurm_queue_completing + slurm_queue_running + slurm_queue_pending + slurm_queue_completed + slurm_queue_timeout + slurm_queue_failed + slurm_queue_node_fail + slurm_queue_suspended + slurm_queue_cancelled + slurm_queue_preempted",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Total",
+ "refId": "K"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Total Jobs",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "decimals": 0,
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Total Nodes": "#052b51"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": 0,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 11,
+ "x": 13,
+ "y": 11
+ },
+ "hiddenSeries": false,
+ "id": 57,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "slurm_nodes_alloc",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Allocated Nodes",
+ "refId": "A"
+ },
+ {
+ "expr": "slurm_nodes_comp",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Completing Nodes",
+ "refId": "B"
+ },
+ {
+ "expr": "slurm_nodes_idle",
+ "format": "time_series",
+ "intervalFactor": 2,
+ "legendFormat": "Idle Nodes",
+ "refId": "C"
+ },
+ {
+ "expr": "slurm_nodes_down",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Down Nodes",
+ "refId": "E"
+ },
+ {
+ "expr": "slurm_nodes_drain",
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Draining Nodes",
+ "refId": "F"
+ },
+ {
+ "expr": "slurm_nodes_err",
+ "interval": "",
+ "legendFormat": "Nodes in *error* state",
+ "refId": "G"
+ },
+ {
+ "expr": "slurm_nodes_fail",
+ "interval": "",
+ "legendFormat": "Nodes in *fail* state",
+ "refId": "H"
+ },
+ {
+ "expr": "slurm_nodes_alloc + slurm_nodes_down + slurm_nodes_drain + slurm_nodes_idle + slurm_nodes_mix + slurm_nodes_comp + slurm_nodes_maint",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 2,
+ "legendFormat": "Total Nodes",
+ "refId": "D"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Nodes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "min": "0",
+ "decimals": 0,
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 10,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "none"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 4,
+ "x": 0,
+ "y": 17
+ },
+ "id": 47,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "slurm_queue_running",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "Running",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Slurm Running Jobs",
+ "type": "gauge"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 23
+ },
+ "id": 2,
+ "panels": [],
+ "title": "CPU, Memory & Disk Stats",
+ "type": "row"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 0,
+ "y": 24
+ },
+ "id": 6,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "count(node_cpu_seconds_total{mode=\"idle\",instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total CPU cores",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 2,
+ "y": 24
+ },
+ "id": 31,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "count(node_cpu_seconds_total{mode=\"idle\",instance_id!=\"__INSTANCE_ID__\"})/count(count(node_cpu_seconds_total{instance_id!=\"__INSTANCE_ID__\"}) by (instance_id))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg. CPU cores",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 4,
+ "y": 24
+ },
+ "id": 17,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total CPU use",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ }
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 6,
+ "y": 24
+ },
+ "id": 32,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",instance_id!=\"__INSTANCE_ID__\"}[1m]))/count(count(node_cpu_seconds_total{instance_id!=\"__INSTANCE_ID__\"}) by (instance_id))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg. CPU use",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 8,
+ "y": 24
+ },
+ "id": 16,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Memory",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 10,
+ "y": 24
+ },
+ "id": 33,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})/count(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg. Memory",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 12,
+ "y": 24
+ },
+ "id": 24,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"}) - sum(node_memory_MemAvailable_bytes{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Memory use",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ },
+ "unit": "decbytes"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 14,
+ "y": 24
+ },
+ "id": 34,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "(sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"}) - sum(node_memory_MemAvailable_bytes{instance_id!=\"__INSTANCE_ID__\"}))/count(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg. Memory use",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 16,
+ "y": 24
+ },
+ "id": 18,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(rate(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Disk (write)",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 18,
+ "y": 24
+ },
+ "id": 35,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(rate(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m])) / count(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg. Disk (write)",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 20,
+ "y": 24
+ },
+ "id": 25,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(rate(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Total Disk (read)",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 2,
+ "x": 22,
+ "y": 24
+ },
+ "id": 36,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(rate(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m])) / count(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Avg. Disk (read)",
+ "type": "stat"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 8,
+ "x": 0,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 4,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "count(node_cpu_seconds_total{mode=\"idle\",instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "Total cores",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "legendFormat": "Load in cores",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU load",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 8,
+ "x": 8,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 15,
+ "legend": {
+ "alignAsTable": false,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "Total Memory",
+ "refId": "B"
+ },
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"}) - sum(node_memory_MemAvailable_bytes{instance_id!=\"__INSTANCE_ID__\"})",
+ "interval": "",
+ "legendFormat": "Used Memory",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "decbytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 28
+ },
+ "hiddenSeries": false,
+ "id": 26,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "Read throughput",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk I/O (read)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 34
+ },
+ "hiddenSeries": false,
+ "id": 23,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "Write throughput",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk I/O (write)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:10296",
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:10297",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 40
+ },
+ "id": 14,
+ "panels": [],
+ "title": "Network Stats",
+ "type": "row"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "semi-dark-green",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 6,
+ "x": 0,
+ "y": 41
+ },
+ "id": 10,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Current Total Throughput (Transmit)",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 6,
+ "x": 6,
+ "y": 41
+ },
+ "id": 28,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))/count(count(node_network_transmit_bytes_total{instance_id!=\"__INSTANCE_ID__\"}) by(instance_id))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Current Avg. Throughput per Node (Transmit)",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "semi-dark-purple",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 6,
+ "x": 12,
+ "y": 41
+ },
+ "id": 11,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Current Total Throughput (Receive)",
+ "type": "stat"
+ },
+ {
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "mappings": [],
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "rgb(143, 143, 143)",
+ "value": null
+ }
+ ]
+ },
+ "unit": "Bps"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 6,
+ "x": 18,
+ "y": 41
+ },
+ "id": 30,
+ "options": {
+ "colorMode": "value",
+ "graphMode": "area",
+ "justifyMode": "auto",
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ }
+ },
+ "pluginVersion": "7.0.6",
+ "targets": [
+ {
+ "expr": "sum(irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))/count(count(node_network_receive_bytes_total{instance_id!=\"__INSTANCE_ID__\"}) by(instance_id))",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Current Avg. Throughput per Node (Receive)",
+ "type": "stat"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "decimals": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 3,
+ "gridPos": {
+ "h": 11,
+ "w": 12,
+ "x": 0,
+ "y": 46
+ },
+ "hiddenSeries": false,
+ "id": 8,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Transmit",
+ "color": "#56A64B"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "Transmit",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Throughput (Transmit)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 3,
+ "gridPos": {
+ "h": 11,
+ "w": 12,
+ "x": 12,
+ "y": 46
+ },
+ "hiddenSeries": false,
+ "id": 12,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Receive",
+ "color": "#A352CC"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))",
+ "interval": "",
+ "legendFormat": "Receive",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Throughput (Receive)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 57
+ },
+ "id": 20,
+ "panels": [],
+ "title": "S3 stats",
+ "type": "row"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 0,
+ "y": 58
+ },
+ "hiddenSeries": false,
+ "id": 55,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "total": true,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:1603",
+ "alias": "transmit",
+ "color": "#A352CC"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "",
+ "dimensions": {
+ "BucketName": "_S3_BUCKET_"
+ },
+ "expression": "",
+ "id": "",
+ "matchExact": false,
+ "metricName": "BytesUploaded",
+ "namespace": "AWS/S3",
+ "period": "",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Sum"
+ ]
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "S3 Throughput (Upload)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 8,
+ "w": 12,
+ "x": 12,
+ "y": 58
+ },
+ "hiddenSeries": false,
+ "id": 54,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "total": true,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:1603",
+ "alias": "transmit",
+ "color": "#A352CC"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "alias": "",
+ "dimensions": {
+ "BucketName": "_S3_BUCKET_"
+ },
+ "expression": "",
+ "id": "",
+ "matchExact": false,
+ "metricName": "BytesDownloaded",
+ "namespace": "AWS/S3",
+ "period": "",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Sum"
+ ]
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "S3 Throughput (Download)",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 25,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-1h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "ParallelCluster Summary",
+ "uid": "r9_U2AzGz",
+ "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/compute-node-details.json b/monitoring/grafana/dashboards/compute-node-details.json
new file mode 100755
index 0000000..951a7f6
--- /dev/null
+++ b/monitoring/grafana/dashboards/compute-node-details.json
@@ -0,0 +1,2198 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": true,
+ "gnetId": 11074,
+ "graphTooltip": 0,
+ "id": 2,
+ "iteration": 1584377467734,
+ "links": [
+ {
+ "icon": "external link",
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorPostfix": false,
+ "colorPrefix": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "decimals": 1,
+ "description": "",
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 0
+ },
+ "hideTimeOverride": true,
+ "id": 15,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "null",
+ "nullText": null,
+ "options": {},
+ "pluginVersion": "6.4.2",
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(time() - node_boot_time_seconds{instance=~\"$node\"})",
+ "format": "time_series",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "System Uptime",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "decimals": 2,
+ "description": "",
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 0
+ },
+ "id": 75,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "70%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$node\"})",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": "2,3",
+ "title": "Total RAM",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "datasource": "prometheus",
+ "gridPos": {
+ "h": 6,
+ "w": 5,
+ "x": 4,
+ "y": 0
+ },
+ "id": 177,
+ "options": {
+ "displayMode": "lcd",
+ "fieldOptions": {
+ "calcs": [
+ "last"
+ ],
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 60
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "title": "",
+ "unit": "percent"
+ },
+ "overrides": [],
+ "values": false
+ },
+ "orientation": "horizontal",
+ "showUnfilled": true
+ },
+ "pluginVersion": "6.6.2",
+ "targets": [
+ {
+ "expr": "100 - (avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) * 100)",
+ "instant": true,
+ "legendFormat": "CPU Busy",
+ "refId": "A"
+ },
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100",
+ "hide": true,
+ "instant": true,
+ "legendFormat": "Busy Iowait",
+ "refId": "C"
+ },
+ {
+ "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / (node_memory_MemTotal_bytes{instance=~\"$node\"})))* 100",
+ "instant": true,
+ "legendFormat": "Used RAM Memory",
+ "refId": "B"
+ },
+ {
+ "expr": "100 - ((node_filesystem_avail_bytes{instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"} * 100) / node_filesystem_size_bytes {instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"})",
+ "hide": false,
+ "instant": true,
+ "legendFormat": "Used Max Mount($maxmount)",
+ "refId": "D"
+ },
+ {
+ "expr": "(1 - (node_memory_SwapFree_bytes{instance=~\"$node\"} / node_memory_SwapTotal_bytes{instance=~\"$node\"})) * 100",
+ "instant": true,
+ "legendFormat": "Used SWAP",
+ "refId": "E"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "",
+ "type": "bargauge"
+ },
+ {
+ "aliasColors": {
+ "15分钟": "#6ED0E0",
+ "1分钟": "#BF1B00",
+ "5分钟": "#CCA300"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fill": 1,
+ "fillGradient": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 9,
+ "y": 0
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 13,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_load1{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_1m",
+ "metric": "",
+ "refId": "A",
+ "step": 20,
+ "target": ""
+ },
+ {
+ "expr": "node_load5{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_5m",
+ "refId": "B",
+ "step": 20
+ },
+ {
+ "expr": "node_load15{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_15m",
+ "refId": "C",
+ "step": 20
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "System Load",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 2,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": "prometheus",
+ "fontSize": "110%",
+ "gridPos": {
+ "h": 6,
+ "w": 7,
+ "x": 17,
+ "y": 0
+ },
+ "id": 164,
+ "links": [],
+ "options": {},
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 6,
+ "desc": false
+ },
+ "styles": [
+ {
+ "alias": "Mounted on",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "mountpoint",
+ "thresholds": [
+ ""
+ ],
+ "type": "string",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Avail",
+ "align": "auto",
+ "colorMode": "value",
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "thresholds": [
+ "10000000000",
+ "20000000000"
+ ],
+ "type": "number",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Used",
+ "align": "auto",
+ "colorMode": "cell",
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [
+ "0.6",
+ "0.8"
+ ],
+ "type": "number",
+ "unit": "percentunit"
+ },
+ {
+ "alias": "Size",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 1,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [],
+ "type": "number",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Filesystem",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "fstype",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "IP",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "instance",
+ "preserveFormat": false,
+ "sanitize": false,
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "preserveFormat": true,
+ "sanitize": false,
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}-0",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "C"
+ },
+ {
+ "expr": "node_filesystem_avail_bytes {instance=~'$node',fstype=~\"ext4|xfs\"}-0",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "interval": "10s",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "title": "Disk Space Used Basic (EXT4/XFS)",
+ "transform": "table",
+ "type": "table"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorPostfix": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "description": "",
+ "format": "short",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 3
+ },
+ "id": 14,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "options": {},
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(count(node_cpu_seconds_total{instance=~\"$node\", mode='system'}) by (cpu))",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "CPU Cores",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "decimals": 2,
+ "description": "",
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 3
+ },
+ "id": 20,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "options": {},
+ "pluginVersion": "6.4.2",
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "#3274D9",
+ "show": true,
+ "ymax": null,
+ "ymin": null
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": "20,50",
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU IOwait",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "aliasColors": {
+ "192.168.200.241:9100_Total": "dark-red",
+ "Idle - Waiting for something to happen": "#052B51",
+ "guest": "#9AC48A",
+ "idle": "#052B51",
+ "iowait": "#EAB839",
+ "irq": "#BF1B00",
+ "nice": "#C15C17",
+ "sdb_每秒I/O操作%": "#d683ce",
+ "softirq": "#E24D42",
+ "steal": "#FCE2DE",
+ "system": "#508642",
+ "user": "#5195CE",
+ "磁盘花费在I/O操作占比": "#ba43a9"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": 2,
+ "description": "",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 11,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "hiddenSeries": false,
+ "id": 7,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Total/",
+ "color": "#C4162A",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"system\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_System",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"user\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_User",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Iowait",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "1 - avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Total",
+ "refId": "F",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 2,
+ "format": "percentunit",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "192.168.10.227:9100_em1_in下载": "super-light-green",
+ "192.168.10.227:9100_em1_out上传": "dark-blue"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fill": 1,
+ "fillGradient": 3,
+ "gridPos": {
+ "h": 11,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 157,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_transmit$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_receive",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_transmit",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bps",
+ "label": "transmit(-)/receive(+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "192.168.200.241:9100_总内存": "dark-red",
+ "内存_Avaliable": "#6ED0E0",
+ "内存_Cached": "#EF843C",
+ "内存_Free": "#629E51",
+ "内存_Total": "#6d1f62",
+ "内存_Used": "#eab839",
+ "可用": "#9ac48a",
+ "总内存": "#bf1b00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": 2,
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 17
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 156,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Total/",
+ "color": "#C4162A",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"}",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Total",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Used",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_MemAvailable_bytes{instance=~\"$node\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Available",
+ "refId": "F",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "vda_write": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "Bytes read / written per second",
+ "fill": 1,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 12,
+ "y": 17
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 168,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Read bytes$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_bytes_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Read bytes",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "irate(node_disk_written_bytes_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Written bytes",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Data",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "Bps",
+ "label": "Bytes read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fill": 1,
+ "fillGradient": 3,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "id": 174,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/Inodes.*/",
+ "yaxis": 2
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}:{{mountpoint}}",
+ "refId": "A"
+ },
+ {
+ "expr": "node_filesystem_files_free{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_files{instance=~'$node',fstype=~\"ext4|xfs\"}",
+ "hide": true,
+ "legendFormat": "Inodes:{{instance}}:{{mountpoint}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk Space Used Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 2,
+ "format": "percentunit",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "decimals": 2,
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": "1",
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "filefd_192.168.200.241:9100": "super-light-green",
+ "switches_192.168.200.241:9100": "semi-dark-red"
+ },
+ "bars": false,
+ "cacheTimeout": null,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "",
+ "fill": 0,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "hideTimeOverride": false,
+ "id": 16,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pluginVersion": "6.4.2",
+ "pointradius": 1,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/filefd_.*/",
+ "lines": false,
+ "pointradius": 1,
+ "points": true
+ },
+ {
+ "alias": "/switches_.*/",
+ "color": "#F2495C",
+ "yaxis": 2
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filefd_allocated{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 5,
+ "legendFormat": "filefd_{{instance}}",
+ "refId": "B"
+ },
+ {
+ "expr": "irate(node_context_switches_total{instance=~\"$node\"}[30m])",
+ "intervalFactor": 5,
+ "legendFormat": "switches_{{instance}}",
+ "refId": "A"
+ },
+ {
+ "expr": "node_filefd_maximum{instance=~\"$node\"}",
+ "hide": true,
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Open File Descriptor(left)/Context switches(right)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": "context_switches",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Idle - Waiting for something to happen": "#052B51",
+ "guest": "#9AC48A",
+ "idle": "#052B51",
+ "iowait": "#EAB839",
+ "irq": "#BF1B00",
+ "nice": "#C15C17",
+ "sdb_每秒I/O操作%": "#d683ce",
+ "softirq": "#E24D42",
+ "steal": "#FCE2DE",
+ "system": "#508642",
+ "user": "#5195CE",
+ "磁盘花费在I/O操作占比": "#ba43a9"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": null,
+ "description": "Time spent doing I/O during each second of wall-clock time.",
+ "fill": 1,
+ "fillGradient": 5,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "id": 175,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_ IO time",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time Spent Doing I/Os",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "s",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "TCP": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "TCP_alloc - Allocated sockets\n\nCurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE-WAIT\n\nTCP_tw - Sockets waiting to close\n\nUDP_inuse - UDP sockets currently in use\n\nSockets_used - Sockets currently in use",
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 35
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 158,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Sockets_used/",
+ "color": "#C4162A",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_netstat_Tcp_CurrEstab{instance=~'$node'}",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_CurrEstab",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "node_sockstat_TCP_tw{instance=~'$node'}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_TCP_tw",
+ "refId": "D"
+ },
+ {
+ "expr": "node_sockstat_sockets_used{instance=~'$node'}",
+ "legendFormat": "{{instance}}_Sockets_used",
+ "refId": "B"
+ },
+ {
+ "expr": "node_sockstat_UDP_inuse{instance=~'$node'}",
+ "legendFormat": "{{instance}}_UDP_inuse",
+ "refId": "C"
+ },
+ {
+ "expr": "node_sockstat_TCP_alloc{instance=~'$node'}",
+ "legendFormat": "{{instance}}_TCP_alloc",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Sockstat",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "vda_write": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "Read/write completions per second",
+ "fill": 1,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 35
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 161,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Reads completed$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Reads completed",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Writes completed",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOps Completed",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "iops",
+ "label": "IO read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "vda": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "Time spent on each read/write operation",
+ "fill": 1,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 35
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 160,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Read time$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Read time",
+ "refId": "B"
+ },
+ {
+ "expr": "irate(node_disk_write_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Write time",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Time(Reference: less than 100ms)(beta)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "Time. read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 22,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "text": "i-08ab98802857b0864",
+ "value": "i-08ab98802857b0864"
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)",
+ "hide": 0,
+ "includeAll": false,
+ "label": "Instance ID",
+ "multi": false,
+ "name": "instance_id",
+ "options": [],
+ "query": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)",
+ "refresh": 1,
+ "regex": "^(?!.*__INSTANCE_ID__).*$",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allFormat": "glob",
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)",
+ "hide": 0,
+ "includeAll": true,
+ "label": "IP",
+ "multi": false,
+ "multiFormat": "regex values",
+ "name": "node",
+ "options": [],
+ "query": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "text": "/",
+ "value": "/"
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "hide": 2,
+ "includeAll": false,
+ "label": "",
+ "multi": false,
+ "name": "maxmount",
+ "options": [],
+ "query": "query_result(topk(1,sort_desc (max(node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}) by (mountpoint))))",
+ "refresh": 2,
+ "regex": "/.*\\\"(.*)\\\".*/",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allFormat": "glob",
+ "allValue": null,
+ "current": {
+ "isNone": true,
+ "selected": false,
+ "text": "None",
+ "value": ""
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "hide": 2,
+ "includeAll": false,
+ "label": "环境",
+ "multi": false,
+ "multiFormat": "regex values",
+ "name": "env",
+ "options": [],
+ "query": "label_values(node_exporter_build_info,env)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allFormat": "glob",
+ "allValue": "",
+ "current": {
+ "text": "None",
+ "value": [
+ ""
+ ]
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_exporter_build_info{env=~'$env'},name)",
+ "hide": 2,
+ "includeAll": false,
+ "label": "名称",
+ "multi": true,
+ "multiFormat": "regex values",
+ "name": "name",
+ "options": [],
+ "query": "label_values(node_exporter_build_info{env=~'$env'},name)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "/.*/",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "now": true,
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Compute Node Details",
+ "uid": "qI8VfvXZz",
+ "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/compute-node-list.json b/monitoring/grafana/dashboards/compute-node-list.json
new file mode 100755
index 0000000..068780a
--- /dev/null
+++ b/monitoring/grafana/dashboards/compute-node-list.json
@@ -0,0 +1,236 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "iteration": 1592242343557,
+ "links": [
+ {
+ "$$hashKey": "object:53",
+ "icon": "external link",
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "columns": [],
+ "datasource": null,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fontSize": "100%",
+ "gridPos": {
+ "h": 24,
+ "w": 9,
+ "x": 0,
+ "y": 0
+ },
+ "id": 2,
+ "pageSize": null,
+ "showHeader": true,
+ "sort": {
+ "col": 2,
+ "desc": true
+ },
+ "styles": [
+ {
+ "alias": "Time",
+ "align": "auto",
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "pattern": "Time",
+ "type": "hidden"
+ },
+ {
+ "alias": "Availability Zone",
+ "align": "left",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "instance_az",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Instance Id",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": true,
+ "linkTargetBlank": true,
+ "linkTooltip": "Go To Node Details",
+ "linkUrl": "/grafana/d/qI8VfvXZz/node-details-copy?var-instance_id=${__cell}",
+ "mappingType": 1,
+ "pattern": "instance_id",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Instance Type",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "instance_type",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "CPU load",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "thresholds": [],
+ "type": "number",
+ "unit": "short"
+ },
+ {
+ "alias": "Transmit Rate",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [],
+ "type": "number",
+ "unit": "Bps"
+ },
+ {
+ "alias": "Receive Rate",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [],
+ "type": "number",
+ "unit": "Bps"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[1m])) by (instance_id, instance_type, instance_az)",
+ "format": "table",
+ "instant": true,
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "sum(rate(node_network_transmit_bytes_total[1m])) by (instance_id, instance_type, instance_az)",
+ "format": "table",
+ "instant": true,
+ "refId": "B"
+ },
+ {
+ "expr": "sum(rate(node_network_receive_bytes_total[1m])) by (instance_id, instance_type, instance_az)",
+ "format": "table",
+ "instant": true,
+ "refId": "C"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "All Available Nodes",
+ "transform": "table",
+ "type": "table-old"
+ }
+ ],
+ "schemaVersion": 25,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "datasource": null,
+ "filters": [
+ {
+ "condition": "",
+ "key": "instance_id",
+ "operator": "!=",
+ "value": "__INSTANCE_ID__"
+ }
+ ],
+ "hide": 2,
+ "label": "",
+ "name": "Filters",
+ "skipUrlSync": false,
+ "type": "adhoc"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Compute Node List",
+ "uid": "SugNQvuWk",
+ "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/costs.json b/monitoring/grafana/dashboards/costs.json
new file mode 100755
index 0000000..0aa1b82
--- /dev/null
+++ b/monitoring/grafana/dashboards/costs.json
@@ -0,0 +1,674 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "links": [
+ {
+ "$$hashKey": "object:56",
+ "asDropdown": false,
+ "icon": "external link",
+ "tags": [],
+ "title": "Dash",
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 10,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#6ED0E0",
+ "value": 500
+ },
+ {
+ "color": "#EAB839",
+ "value": 1000
+ },
+ {
+ "color": "red",
+ "value": 1800
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 0,
+ "y": 0
+ },
+ "id": 10,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "master_node_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Master Node hourly Cost",
+ "type": "gauge"
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 5,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "red",
+ "value": 4.01
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 4,
+ "y": 0
+ },
+ "id": 7,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "ebs_master_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "EBS (Master node) hourly Cost",
+ "type": "gauge"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 14,
+ "w": 16,
+ "x": 8,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.0",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": true,
+ "targets": [
+ {
+ "expr": "s3_cost+master_node_cost+fsx_cost+ebs_master_cost+compute_nodes_cost+ebs_compute_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "Entire Cluster cost",
+ "refId": "A"
+ },
+ {
+ "expr": "s3_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "S3",
+ "refId": "B"
+ },
+ {
+ "expr": "master_node_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "Master Node",
+ "refId": "C"
+ },
+ {
+ "expr": "fsx_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "FSx",
+ "refId": "D"
+ },
+ {
+ "expr": "ebs_master_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "EBS (Master)",
+ "refId": "E"
+ },
+ {
+ "expr": "compute_nodes_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "Compute Nodes",
+ "refId": "F"
+ },
+ {
+ "expr": "ebs_compute_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": "EBS (Compute)",
+ "refId": "G"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Cluster Total Hourly Cost",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 1,
+ "format": "currencyUSD",
+ "label": "$ / hour",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 2000,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 500.01
+ },
+ {
+ "color": "red",
+ "value": 1800
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 0,
+ "y": 7
+ },
+ "id": 9,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "compute_nodes_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Compute Node hourly Cost",
+ "type": "gauge"
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 20,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 5.01
+ },
+ {
+ "color": "red",
+ "value": 18
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 4,
+ "y": 7
+ },
+ "id": 8,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "ebs_compute_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "EBS (Compute nodes) hourly Cost",
+ "type": "gauge"
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 20,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 5.01
+ },
+ {
+ "color": "red",
+ "value": 18
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 0,
+ "y": 14
+ },
+ "id": 5,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "fsx_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "FSx hourly Cost",
+ "type": "gauge"
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 20,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 5.01
+ },
+ {
+ "color": "red",
+ "value": 18
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 4,
+ "y": 14
+ },
+ "id": 6,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "s3_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "S3 hourly Cost",
+ "type": "gauge"
+ },
+ {
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "decimals": 2,
+ "displayName": "Jobs",
+ "mappings": [],
+ "max": 20,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 5.01
+ },
+ {
+ "color": "red",
+ "value": 18
+ }
+ ]
+ },
+ "unit": "currencyUSD"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 7,
+ "w": 4,
+ "x": 8,
+ "y": 14
+ },
+ "id": 11,
+ "options": {
+ "orientation": "auto",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showThresholdLabels": false,
+ "showThresholdMarkers": true
+ },
+ "pluginVersion": "7.1.0",
+ "targets": [
+ {
+ "expr": "s3_cost",
+ "instant": false,
+ "interval": "1",
+ "intervalFactor": 1,
+ "legendFormat": " ",
+ "refId": "A"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "EFS hourly cost",
+ "type": "gauge"
+ }
+ ],
+ "refresh": "1m",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Cluster Cost",
+ "uid": "LTU-foiMk",
+ "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/dashboards.yml b/monitoring/grafana/dashboards/dashboards.yml
new file mode 100755
index 0000000..181f601
--- /dev/null
+++ b/monitoring/grafana/dashboards/dashboards.yml
@@ -0,0 +1,10 @@
+apiVersion: 1
+providers:
+ - name: "Dashboards"
+ orgId: 1
+ folder: ""
+ type: file
+ disableDeletion: false
+ editable: true
+ options:
+ path: /etc/grafana/provisioning/dashboards
diff --git a/monitoring/grafana/dashboards/gpu.json b/monitoring/grafana/dashboards/gpu.json
new file mode 100644
index 0000000..fa3d35c
--- /dev/null
+++ b/monitoring/grafana/dashboards/gpu.json
@@ -0,0 +1,1921 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "Dashboard to visualize data from the NVIDIA Data Center GPU Manager (DCGM)",
+ "editable": true,
+ "gnetId": 11752,
+ "graphTooltip": 0,
+ "iteration": 1606149765866,
+ "links": [
+ {
+ "icon": "external link",
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "hertz",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 3,
+ "x": 0,
+ "y": 0
+ },
+ "hideTimeOverride": true,
+ "id": 59,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "nodename",
+ "targets": [
+ {
+ "expr": "node_uname_info{instance_id=~\"$instance_id\"}",
+ "format": "table",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Hostname",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "hertz",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 2,
+ "w": 3,
+ "x": 3,
+ "y": 0
+ },
+ "id": 60,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "release",
+ "targets": [
+ {
+ "expr": "node_uname_info{instance_id=\"$instance_id\"}",
+ "format": "table",
+ "intervalFactor": 1,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "Kernel",
+ "type": "singlestat",
+ "valueFontSize": "50%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 6,
+ "y": 0
+ },
+ "id": 58,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "80,90",
+ "title": "GPU Total Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "watt",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 9,
+ "y": 0
+ },
+ "id": 30,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "60,90",
+ "title": "GPU Total Power",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "celsius",
+ "gauge": {
+ "maxValue": 90,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 12,
+ "y": 0
+ },
+ "id": 31,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "83,87",
+ "title": "GPU Avg. Temperature",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 15,
+ "y": 0
+ },
+ "id": 68,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "80,90",
+ "title": "GPU Decoder Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 18,
+ "y": 0
+ },
+ "id": 67,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "80,90",
+ "title": "GPU Encoder Utilization",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": true,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 5,
+ "w": 3,
+ "x": 21,
+ "y": 0
+ },
+ "id": 40,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"})*100",
+ "format": "time_series",
+ "instant": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "70,90",
+ "title": "GPU Mem Util.",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "hertz",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 0,
+ "y": 2
+ },
+ "id": 44,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "GPU SM Clocks",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "hertz",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 3,
+ "x": 3,
+ "y": 2
+ },
+ "id": 45,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": "",
+ "title": "GPU Memory Clocks",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 0,
+ "y": 5
+ },
+ "hiddenSeries": false,
+ "id": 35,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "sum(rate(DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{instance_id=\"$instance_id\"}[$__interval]))",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "Total",
+ "refId": "A"
+ },
+ {
+ "expr": "dcgm_nv",
+ "format": "time_series",
+ "hide": true,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "NVLINK Bandwidth",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "Bps",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 8,
+ "y": 5
+ },
+ "hiddenSeries": false,
+ "id": 57,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU Utilization",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 16,
+ "y": 5
+ },
+ "hiddenSeries": false,
+ "id": 25,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU Temperature",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "celsius",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 0,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 36,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "Rx",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_PCIE_TX_THROUGHPUT{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "GPU{{gpu}}_Tx",
+ "refId": "A"
+ },
+ {
+ "expr": "DCGM_FI_DEV_PCIE_RX_THROUGHPUT{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 2,
+ "legendFormat": "GPU{{gpu}}_Rx",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "PCIe Throughput",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "KBs",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 8,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 64,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}",
+ "interval": "",
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU SM Clocks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "hertz",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 16,
+ "y": 10
+ },
+ "hiddenSeries": false,
+ "id": 24,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": true,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU Power Usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 0,
+ "y": 15
+ },
+ "hiddenSeries": false,
+ "id": 42,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU Mem Usage",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:1193",
+ "decimals": null,
+ "format": "decmbytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "$$hashKey": "object:1194",
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 8,
+ "y": 15
+ },
+ "hiddenSeries": false,
+ "id": 62,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "nullPointMode": "null",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}",
+ "interval": "",
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU Memory Clocks",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "hertz",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {},
+ "links": []
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 8,
+ "x": 16,
+ "y": 15
+ },
+ "hiddenSeries": false,
+ "id": 39,
+ "legend": {
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "connected",
+ "options": {
+ "alertThreshold": true
+ },
+ "percentage": false,
+ "pluginVersion": "7.3.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance_id=\"$instance_id\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "GPU {{gpu}}",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "GPU Mem Cpy Utilization",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "percent",
+ "label": null,
+ "logBase": 1,
+ "max": "100",
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "watt",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [
+ "GPU"
+ ],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "selected": true,
+ "text": "i-0edc71e97259faa4c",
+ "value": "i-0edc71e97259faa4c"
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"[pg][2-4].*\"}, instance_id)",
+ "error": null,
+ "hide": 0,
+ "includeAll": false,
+ "label": "Instance ID",
+ "multi": false,
+ "name": "instance_id",
+ "options": [],
+ "query": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"[pg][2-4].*\"}, instance_id)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "5s",
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "GPU Nodes",
+ "uid": "hpcsyl6zhqk",
+ "version": 2
+}
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/logs.json b/monitoring/grafana/dashboards/logs.json
new file mode 100755
index 0000000..c3c2803
--- /dev/null
+++ b/monitoring/grafana/dashboards/logs.json
@@ -0,0 +1,729 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "links": [
+ {
+ "$$hashKey": "object:58",
+ "icon": "external link",
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "id": 15,
+ "panels": [],
+ "title": "Slurm",
+ "type": "row"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 1
+ },
+ "id": 16,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /slurmctld/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Slurmctld",
+ "type": "logs"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 12,
+ "y": 1
+ },
+ "id": 17,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /slurmd/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Slurmd",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 13
+ },
+ "id": 10,
+ "panels": [],
+ "title": "Watchers",
+ "type": "row"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 15,
+ "w": 8,
+ "x": 0,
+ "y": 14
+ },
+ "id": 11,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /sqswatcher/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "SQS-Watcher Logs",
+ "type": "logs"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 15,
+ "w": 8,
+ "x": 8,
+ "y": 14
+ },
+ "id": 12,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /nodewatcher/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Node Watcher Logs",
+ "type": "logs"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 15,
+ "w": 8,
+ "x": 16,
+ "y": 14
+ },
+ "id": 13,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /jobwatcher/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "JOB Watcher Logs",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 29
+ },
+ "id": 19,
+ "panels": [],
+ "title": "Init",
+ "type": "row"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 12,
+ "x": 0,
+ "y": 30
+ },
+ "id": 20,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /cfn-init/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "cfn-init",
+ "type": "logs"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 11,
+ "x": 12,
+ "y": 30
+ },
+ "id": 21,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /cloud-init/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "cloud-init",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 43
+ },
+ "id": 23,
+ "panels": [],
+ "title": "Others",
+ "type": "row"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 0,
+ "y": 44
+ },
+ "id": 24,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /supervisord/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Supervisord",
+ "type": "logs"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 14,
+ "w": 12,
+ "x": 12,
+ "y": 44
+ },
+ "id": 25,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /system-messages/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "System Messages",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 58
+ },
+ "id": 6,
+ "panels": [],
+ "title": "DCV",
+ "type": "row"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 12,
+ "x": 0,
+ "y": 59
+ },
+ "id": 3,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /dcv-authenticator/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "DCV Authenticator Logs",
+ "type": "logs"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 13,
+ "w": 12,
+ "x": 12,
+ "y": 59
+ },
+ "id": 4,
+ "options": {
+ "showLabels": false,
+ "showTime": false,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /dcv-server/",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "DCV Server Logs",
+ "type": "logs"
+ },
+ {
+ "collapsed": false,
+ "datasource": null,
+ "gridPos": {
+ "h": 1,
+ "w": 24,
+ "x": 0,
+ "y": 72
+ },
+ "id": 8,
+ "panels": [],
+ "title": "Aggregate Logs",
+ "type": "row"
+ },
+ {
+ "datasource": "cloudwatch",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 4,
+ "w": 24,
+ "x": 0,
+ "y": 73
+ },
+ "id": 2,
+ "options": {
+ "showLabels": false,
+ "showTime": true,
+ "sortOrder": "Descending",
+ "wrapLogMessage": true
+ },
+ "targets": [
+ {
+ "alias": "",
+ "apiMode": "Logs",
+ "dimensions": {},
+ "expression": "fields @timestamp, @message \n| sort @timestamp desc\n| limit 20",
+ "id": "",
+ "logGroupNames": [
+ "__LOG_GROUP__NAMES__"
+ ],
+ "matchExact": true,
+ "metricName": "",
+ "namespace": "",
+ "period": "",
+ "queryMode": "Logs",
+ "refId": "A",
+ "region": "__AWS_REGION__",
+ "statistics": [
+ "Average"
+ ],
+ "statsGroups": []
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "Cluster Logs (ALL)",
+ "type": "logs"
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 25,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": []
+ },
+ "time": {
+ "from": "now-6h",
+ "to": "now"
+ },
+ "timepicker": {
+ "hidden": false,
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ]
+ },
+ "timezone": "",
+ "title": "Cluster Logs",
+ "uid": "o_8MQ5mMk",
+ "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/dashboards/master-node-details.json b/monitoring/grafana/dashboards/master-node-details.json
new file mode 100755
index 0000000..8c4e229
--- /dev/null
+++ b/monitoring/grafana/dashboards/master-node-details.json
@@ -0,0 +1,2299 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "-- Grafana --",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "name": "Annotations & Alerts",
+ "type": "dashboard"
+ }
+ ]
+ },
+ "description": "",
+ "editable": true,
+ "gnetId": 11074,
+ "graphTooltip": 0,
+ "iteration": 1592040183080,
+ "links": [
+ {
+ "icon": "external link",
+ "tags": [],
+ "type": "dashboards"
+ }
+ ],
+ "panels": [
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorPostfix": false,
+ "colorPrefix": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "decimals": 1,
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "s",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 0
+ },
+ "hideTimeOverride": true,
+ "id": 15,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "null",
+ "nullText": null,
+ "pluginVersion": "6.4.2",
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(time() - node_boot_time_seconds{instance=~\"$node\"})",
+ "format": "time_series",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "refId": "A",
+ "step": 40
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "System Uptime",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "decimals": 2,
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "bytes",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 0
+ },
+ "id": 75,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "70%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$node\"})",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": "2,3",
+ "title": "Total RAM",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "color": {
+ "mode": "thresholds"
+ },
+ "custom": {},
+ "displayName": "",
+ "mappings": [],
+ "max": 100,
+ "min": 0,
+ "thresholds": {
+ "mode": "absolute",
+ "steps": [
+ {
+ "color": "green",
+ "value": null
+ },
+ {
+ "color": "#EAB839",
+ "value": 60
+ },
+ {
+ "color": "red",
+ "value": 80
+ }
+ ]
+ },
+ "unit": "percent"
+ },
+ "overrides": []
+ },
+ "gridPos": {
+ "h": 6,
+ "w": 5,
+ "x": 4,
+ "y": 0
+ },
+ "id": 177,
+ "options": {
+ "displayMode": "lcd",
+ "orientation": "horizontal",
+ "reduceOptions": {
+ "calcs": [
+ "last"
+ ],
+ "fields": "",
+ "values": false
+ },
+ "showUnfilled": true
+ },
+ "pluginVersion": "7.0.3",
+ "targets": [
+ {
+ "expr": "100 - (avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) * 100)",
+ "instant": true,
+ "legendFormat": "CPU Busy",
+ "refId": "A"
+ },
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100",
+ "hide": true,
+ "instant": true,
+ "legendFormat": "Busy Iowait",
+ "refId": "C"
+ },
+ {
+ "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / (node_memory_MemTotal_bytes{instance=~\"$node\"})))* 100",
+ "instant": true,
+ "legendFormat": "Used RAM Memory",
+ "refId": "B"
+ },
+ {
+ "expr": "100 - ((node_filesystem_avail_bytes{instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"} * 100) / node_filesystem_size_bytes {instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"})",
+ "hide": false,
+ "instant": true,
+ "legendFormat": "Used Max Mount($maxmount)",
+ "refId": "D"
+ },
+ {
+ "expr": "(1 - (node_memory_SwapFree_bytes{instance=~\"$node\"} / node_memory_SwapTotal_bytes{instance=~\"$node\"})) * 100",
+ "instant": true,
+ "legendFormat": "Used SWAP",
+ "refId": "E"
+ }
+ ],
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "",
+ "type": "bargauge"
+ },
+ {
+ "aliasColors": {
+ "15分钟": "#6ED0E0",
+ "1分钟": "#BF1B00",
+ "5分钟": "#CCA300"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 1,
+ "grid": {},
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 9,
+ "y": 0
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 13,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_load1{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_1m",
+ "metric": "",
+ "refId": "A",
+ "step": 20,
+ "target": ""
+ },
+ {
+ "expr": "node_load5{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_5m",
+ "refId": "B",
+ "step": 20
+ },
+ {
+ "expr": "node_load15{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_15m",
+ "refId": "C",
+ "step": 20
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "System Load",
+ "tooltip": {
+ "msResolution": false,
+ "shared": true,
+ "sort": 2,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "columns": [],
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fontSize": "110%",
+ "gridPos": {
+ "h": 6,
+ "w": 7,
+ "x": 17,
+ "y": 0
+ },
+ "id": 164,
+ "links": [],
+ "pageSize": null,
+ "scroll": true,
+ "showHeader": true,
+ "sort": {
+ "col": 6,
+ "desc": false
+ },
+ "styles": [
+ {
+ "alias": "Mounted on",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "mountpoint",
+ "thresholds": [
+ ""
+ ],
+ "type": "string",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Avail",
+ "align": "auto",
+ "colorMode": "value",
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #A",
+ "thresholds": [
+ "10000000000",
+ "20000000000"
+ ],
+ "type": "number",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Used",
+ "align": "auto",
+ "colorMode": "cell",
+ "colors": [
+ "rgba(50, 172, 45, 0.97)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(245, 54, 54, 0.9)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "mappingType": 1,
+ "pattern": "Value #B",
+ "thresholds": [
+ "0.6",
+ "0.8"
+ ],
+ "type": "number",
+ "unit": "percentunit"
+ },
+ {
+ "alias": "Size",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 1,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "Value #C",
+ "thresholds": [],
+ "type": "number",
+ "unit": "bytes"
+ },
+ {
+ "alias": "Filesystem",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "fstype",
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "IP",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "dateFormat": "YYYY-MM-DD HH:mm:ss",
+ "decimals": 2,
+ "link": false,
+ "mappingType": 1,
+ "pattern": "instance",
+ "preserveFormat": false,
+ "sanitize": false,
+ "thresholds": [],
+ "type": "string",
+ "unit": "short"
+ },
+ {
+ "alias": "",
+ "align": "auto",
+ "colorMode": null,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "decimals": 2,
+ "pattern": "/.*/",
+ "preserveFormat": true,
+ "sanitize": false,
+ "thresholds": [],
+ "type": "hidden",
+ "unit": "short"
+ }
+ ],
+ "targets": [
+ {
+ "expr": "node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}-0",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "C"
+ },
+ {
+ "expr": "node_filesystem_avail_bytes {instance=~'$node',fstype=~\"ext4|xfs\"}-0",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "interval": "10s",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})",
+ "format": "table",
+ "hide": false,
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "title": "Disk Space Used Basic(EXT4/XFS)",
+ "transform": "table",
+ "type": "table-old"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorPostfix": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(245, 54, 54, 0.9)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(50, 172, 45, 0.97)"
+ ],
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "short",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 0,
+ "y": 3
+ },
+ "id": 14,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "sum(count(node_cpu_seconds_total{instance=~\"$node\", mode='system'}) by (cpu))",
+ "format": "time_series",
+ "instant": true,
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": "1,2",
+ "title": "CPU Cores",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "#299c46",
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
+ ],
+ "datasource": "prometheus",
+ "decimals": 2,
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "format": "percent",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "gridPos": {
+ "h": 3,
+ "w": 2,
+ "x": 2,
+ "y": 3
+ },
+ "id": 20,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "pluginVersion": "6.4.2",
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "#3274D9",
+ "show": true,
+ "ymax": null,
+ "ymin": null
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "",
+ "refId": "A",
+ "step": 20
+ }
+ ],
+ "thresholds": "20,50",
+ "timeFrom": null,
+ "timeShift": null,
+ "title": "CPU IOwait",
+ "type": "singlestat",
+ "valueFontSize": "100%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "N/A",
+ "value": "null"
+ }
+ ],
+ "valueName": "avg"
+ },
+ {
+ "aliasColors": {
+ "192.168.200.241:9100_Total": "dark-red",
+ "Idle - Waiting for something to happen": "#052B51",
+ "guest": "#9AC48A",
+ "idle": "#052B51",
+ "iowait": "#EAB839",
+ "irq": "#BF1B00",
+ "nice": "#C15C17",
+ "sdb_每秒I/O操作%": "#d683ce",
+ "softirq": "#E24D42",
+ "steal": "#FCE2DE",
+ "system": "#508642",
+ "user": "#5195CE",
+ "磁盘花费在I/O操作占比": "#ba43a9"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": 2,
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 11,
+ "w": 12,
+ "x": 0,
+ "y": 6
+ },
+ "hiddenSeries": false,
+ "id": 7,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sideWidth": null,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "repeat": null,
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Total/",
+ "color": "#C4162A",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"system\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_System",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"user\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_User",
+ "refId": "B",
+ "step": 240
+ },
+ {
+ "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Iowait",
+ "refId": "D",
+ "step": 240
+ },
+ {
+ "expr": "1 - avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) by (instance)",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Total",
+ "refId": "F",
+ "step": 240
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 2,
+ "format": "percentunit",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "192.168.10.227:9100_em1_in下载": "super-light-green",
+ "192.168.10.227:9100_em1_out上传": "dark-blue"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 3,
+ "gridPos": {
+ "h": 11,
+ "w": 12,
+ "x": 12,
+ "y": 6
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 157,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 2,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_transmit$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_network_receive_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_receive",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "irate(node_network_transmit_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_transmit",
+ "refId": "B",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Traffic Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:228",
+ "format": "bps",
+ "label": "transmit(-)/receive(+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:229",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "192.168.200.241:9100_总内存": "dark-red",
+ "内存_Avaliable": "#6ED0E0",
+ "内存_Cached": "#EF843C",
+ "内存_Free": "#629E51",
+ "内存_Total": "#6d1f62",
+ "内存_Used": "#eab839",
+ "可用": "#9ac48a",
+ "总内存": "#bf1b00"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": 2,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 0,
+ "y": 17
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 156,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "max": false,
+ "min": false,
+ "rightSide": true,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Total/",
+ "color": "#C4162A",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"}",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Total",
+ "refId": "A",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Used",
+ "refId": "B",
+ "step": 4
+ },
+ {
+ "expr": "node_memory_MemAvailable_bytes{instance=~\"$node\"}",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_Available",
+ "refId": "F",
+ "step": 4
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "bytes",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": "0",
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "vda_write": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "Per second read / write bytes ",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 12,
+ "w": 12,
+ "x": 12,
+ "y": 17
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 168,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "$$hashKey": "object:131",
+ "alias": "/.*_Read bytes$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_bytes_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Read bytes",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "irate(node_disk_written_bytes_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Written bytes",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Data",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "$$hashKey": "object:144",
+ "decimals": null,
+ "format": "Bps",
+ "label": "Bytes read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "$$hashKey": "object:145",
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 3,
+ "gridPos": {
+ "h": 7,
+ "w": 8,
+ "x": 0,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "id": 174,
+ "legend": {
+ "alignAsTable": true,
+ "avg": false,
+ "current": true,
+ "hideEmpty": false,
+ "hideZero": false,
+ "max": false,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/Inodes.*/",
+ "yaxis": 2
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})",
+ "format": "time_series",
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}:{{mountpoint}}",
+ "refId": "A"
+ },
+ {
+ "expr": "node_filesystem_files_free{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_files{instance=~'$node',fstype=~\"ext4|xfs\"}",
+ "hide": true,
+ "legendFormat": "Inodes:{{instance}}:{{mountpoint}}",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk Space Used Basic",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": 2,
+ "format": "percentunit",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "decimals": 2,
+ "format": "percentunit",
+ "label": null,
+ "logBase": 1,
+ "max": "1",
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "filefd_192.168.200.241:9100": "super-light-green",
+ "switches_192.168.200.241:9100": "semi-dark-red"
+ },
+ "bars": false,
+ "cacheTimeout": null,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 0,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "hideTimeOverride": false,
+ "id": 16,
+ "legend": {
+ "alignAsTable": false,
+ "avg": false,
+ "current": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pluginVersion": "6.4.2",
+ "pointradius": 1,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/filefd_.*/",
+ "lines": false,
+ "pointradius": 1,
+ "points": true
+ },
+ {
+ "alias": "/switches_.*/",
+ "color": "#F2495C",
+ "yaxis": 2
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_filefd_allocated{instance=~\"$node\"}",
+ "format": "time_series",
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 5,
+ "legendFormat": "filefd_{{instance}}",
+ "refId": "B"
+ },
+ {
+ "expr": "irate(node_context_switches_total{instance=~\"$node\"}[30m])",
+ "intervalFactor": 5,
+ "legendFormat": "switches_{{instance}}",
+ "refId": "A"
+ },
+ {
+ "expr": "node_filefd_maximum{instance=~\"$node\"}",
+ "hide": true,
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Open File Descriptor(left)/Context switches(right)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": "context_switches",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "Idle - Waiting for something to happen": "#052B51",
+ "guest": "#9AC48A",
+ "idle": "#052B51",
+ "iowait": "#EAB839",
+ "irq": "#BF1B00",
+ "nice": "#C15C17",
+ "sdb_每秒I/O操作%": "#d683ce",
+ "softirq": "#E24D42",
+ "steal": "#FCE2DE",
+ "system": "#508642",
+ "user": "#5195CE",
+ "磁盘花费在I/O操作占比": "#ba43a9"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "decimals": null,
+ "description": "The time spent on I/O in the natural time of each second.(wall-clock time)",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 5,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 29
+ },
+ "hiddenSeries": false,
+ "id": 175,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sideWidth": null,
+ "sort": null,
+ "sortDesc": null,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "maxPerRow": 6,
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_IO time",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Time Spent Doing I/Os",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "s",
+ "label": "",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "vda_write": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "Read/write completions per second",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 8,
+ "y": 35
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 161,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Reads completed$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Reads completed",
+ "refId": "A",
+ "step": 10
+ },
+ {
+ "expr": "irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Writes completed",
+ "refId": "B",
+ "step": 10
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk IOps Completed",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "decimals": null,
+ "format": "iops",
+ "label": "IO read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "vda": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "Time spent on each read/write operation",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 1,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 16,
+ "y": 35
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 160,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null as zero",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Read time$/",
+ "transform": "negative-Y"
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "irate(node_disk_read_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Read time",
+ "refId": "B"
+ },
+ {
+ "expr": "irate(node_disk_write_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_{{device}}_Write time",
+ "refId": "C"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Disk R/W Time(Reference: less than 100ms)(beta)",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "s",
+ "label": "Time. read (-) / write (+)",
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": false
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {
+ "TCP": "#6ED0E0"
+ },
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "prometheus",
+ "description": "TCP_alloc - Allocated sockets\n\nCurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE-WAIT\n\nTCP_tw - Sockets waiting to close (TIME_WAIT)\n\nUDP_inuse - UDP sockets currently in use\n\nSockets_used - Sockets currently in use",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 8,
+ "x": 0,
+ "y": 36
+ },
+ "height": "300",
+ "hiddenSeries": false,
+ "id": 158,
+ "interval": "",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "hideEmpty": true,
+ "hideZero": true,
+ "max": true,
+ "min": false,
+ "rightSide": false,
+ "show": true,
+ "sort": "current",
+ "sortDesc": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 2,
+ "links": [],
+ "nullPointMode": "null",
+ "options": {
+ "dataLinks": []
+ },
+ "percentage": false,
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [
+ {
+ "alias": "/.*_Sockets_used/",
+ "color": "#C4162A",
+ "fill": 0
+ }
+ ],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "node_netstat_Tcp_CurrEstab{instance=~'$node'}",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "interval": "",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_CurrEstab",
+ "refId": "A",
+ "step": 20
+ },
+ {
+ "expr": "node_sockstat_TCP_tw{instance=~'$node'}",
+ "format": "time_series",
+ "intervalFactor": 1,
+ "legendFormat": "{{instance}}_TCP_tw",
+ "refId": "D"
+ },
+ {
+ "expr": "node_sockstat_sockets_used{instance=~'$node'}",
+ "legendFormat": "{{instance}}_Sockets_used",
+ "refId": "B"
+ },
+ {
+ "expr": "node_sockstat_UDP_inuse{instance=~'$node'}",
+ "legendFormat": "{{instance}}_UDP_inuse",
+ "refId": "C"
+ },
+ {
+ "expr": "node_sockstat_TCP_alloc{instance=~'$node'}",
+ "legendFormat": "{{instance}}_TCP_alloc",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Network Sockstat",
+ "tooltip": {
+ "shared": true,
+ "sort": 2,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "5s",
+ "schemaVersion": 25,
+ "style": "dark",
+ "tags": [],
+ "templating": {
+ "list": [
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "__INSTANCE_ID__",
+ "value": "__INSTANCE_ID__"
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)",
+ "hide": 2,
+ "includeAll": false,
+ "label": "Instance ID",
+ "multi": false,
+ "name": "instance_id",
+ "options": [],
+ "query": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allFormat": "glob",
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)",
+ "hide": 2,
+ "includeAll": true,
+ "label": "IP",
+ "multi": false,
+ "multiFormat": "regex values",
+ "name": "node",
+ "options": [],
+ "query": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "/",
+ "value": "/"
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "hide": 2,
+ "includeAll": false,
+ "label": "",
+ "multi": false,
+ "name": "maxmount",
+ "options": [],
+ "query": "query_result(topk(1,sort_desc (max(node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}) by (mountpoint))))",
+ "refresh": 2,
+ "regex": "/.*\\\"(.*)\\\".*/",
+ "skipUrlSync": false,
+ "sort": 0,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allFormat": "glob",
+ "allValue": null,
+ "current": {
+ "isNone": true,
+ "selected": false,
+ "text": "None",
+ "value": ""
+ },
+ "datasource": "prometheus",
+ "definition": "",
+ "hide": 2,
+ "includeAll": false,
+ "label": "环境",
+ "multi": false,
+ "multiFormat": "regex values",
+ "name": "env",
+ "options": [],
+ "query": "label_values(node_exporter_build_info,env)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ },
+ {
+ "allFormat": "glob",
+ "allValue": "",
+ "current": {
+ "selected": true,
+ "text": "None",
+ "value": [
+ ""
+ ]
+ },
+ "datasource": "prometheus",
+ "definition": "label_values(node_exporter_build_info{env=~'$env'},name)",
+ "hide": 2,
+ "includeAll": false,
+ "label": "名称",
+ "multi": true,
+ "multiFormat": "regex values",
+ "name": "name",
+ "options": [],
+ "query": "label_values(node_exporter_build_info{env=~'$env'},name)",
+ "refresh": 2,
+ "regex": "",
+ "skipUrlSync": false,
+ "sort": 1,
+ "tagValuesQuery": "/.*/",
+ "tags": [],
+ "tagsQuery": "",
+ "type": "query",
+ "useTags": false
+ }
+ ]
+ },
+ "time": {
+ "from": "now-15m",
+ "to": "now"
+ },
+ "timepicker": {
+ "now": true,
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Master Node Details",
+ "uid": "3NR7BmmMk",
+ "version": 1
+}
\ No newline at end of file
diff --git a/monitoring/grafana/datasources/datasource.yml b/monitoring/grafana/datasources/datasource.yml
new file mode 100755
index 0000000..ae235f3
--- /dev/null
+++ b/monitoring/grafana/datasources/datasource.yml
@@ -0,0 +1,27 @@
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+apiVersion: 1
+datasources:
+  - name: prometheus  # the dashboards select this datasource by name ("datasource": "prometheus")
+    type: prometheus
+    access: proxy
+    orgId: 1
+    version: 1
+    url: http://localhost:9090
+    isDefault: true
+    editable: true
+    jsonData:
+      timeInterval: 5s  # same as the shortest scrape_interval used in prometheus.yml
+  - name: cloudwatch
+    type: cloudwatch
+    orgId: 1
+    version: 1
+    editable: true
+    jsonData:
+      authType: default
+      defaultRegion: us-east-1  # was "us-east", which is not an AWS region name; NOTE(review): confirm no install step rewrites this, and use the cluster's region if it differs
+
\ No newline at end of file
diff --git a/monitoring/nginx/conf.d/nginx.conf b/monitoring/nginx/conf.d/nginx.conf
new file mode 100644
index 0000000..0791b19
--- /dev/null
+++ b/monitoring/nginx/conf.d/nginx.conf
@@ -0,0 +1,22 @@
+server { # plain-HTTP front end: each location below forwards one URL prefix to a service on localhost
+ listen 80 default_server;
+ listen [::]:80 default_server;
+ server_name _;
+ server_tokens off;
+
+ location /grafana/ {
+ proxy_pass http://localhost:3000/; # presumably Grafana's HTTP port - confirm against the Grafana config
+ }
+
+ location /prometheus/ {
+ proxy_pass http://localhost:9090/; # same address the Grafana "prometheus" datasource points at
+ }
+
+ location /pushgateway/ {
+ proxy_pass http://localhost:9091/; # same port as the 'pushgateway' scrape target in prometheus.yml
+ }
+
+ location /slurmexporter/ {
+ proxy_pass http://localhost:8081/; # port the slurm_exporter systemd service listens on
+ }
+}
\ No newline at end of file
diff --git a/monitoring/prometheus-slurm-exporter/slurm_exporter.service b/monitoring/prometheus-slurm-exporter/slurm_exporter.service
new file mode 100644
index 0000000..d54498b
--- /dev/null
+++ b/monitoring/prometheus-slurm-exporter/slurm_exporter.service
@@ -0,0 +1,13 @@
+[Unit]
+Description=Prometheus SLURM Exporter
+
+[Service]
+# systemd does not expand $PATH inside Environment=, so the search path is spelled out in full.
+Environment=PATH=/opt/slurm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ExecStart=/usr/bin/prometheus-slurm-exporter -listen-address 0.0.0.0:8081
+Restart=on-failure
+RestartSec=15
+Type=simple
+
+[Install]
+WantedBy=multi-user.target
\ No newline at end of file
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
new file mode 100755
index 0000000..3438fbc
--- /dev/null
+++ b/monitoring/prometheus/prometheus.yml
@@ -0,0 +1,73 @@
+#
+#
+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+# SPDX-License-Identifier: MIT-0
+#
+#
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  scrape_timeout: 15s
+
+scrape_configs:
+  - job_name: 'slurm_exporter'  # port 8081 is where slurm_exporter.service listens
+    scrape_interval: 30s
+    scrape_timeout: 30s
+    static_configs:
+      - targets: ['localhost:8081']
+  - job_name: 'pushgateway'
+    honor_labels: true
+    static_configs:
+      - targets: ['localhost:9091']
+  - job_name: 'prometheus_server'
+    scrape_interval: 5s
+    static_configs:
+      - targets: ['localhost:9090']
+  - job_name: 'ec2_instances'  # the dashboards' instance_id variable queries this job name
+    scrape_interval: 5s
+    ec2_sd_configs:
+      - port: 9100  # no filter: applies to every discovered instance
+        refresh_interval: 10s
+      - port: 9400  # only the GPU instance types listed below; presumably a GPU metrics exporter - confirm
+        refresh_interval: 10s
+        filters:
+          - name: instance-type
+            values:
+              - p2.xlarge
+              - p2.8xlarge
+              - p2.16xlarge
+              - p3.2xlarge
+              - p3.8xlarge
+              - p3.16xlarge
+              - p3dn.24xlarge
+              - p4d.24xlarge
+              - g3s.xlarge
+              - g3.4xlarge
+              - g3.8xlarge
+              - g3.16xlarge
+              - g4dn.xlarge
+              - g4dn.2xlarge
+              - g4dn.4xlarge
+              - g4dn.8xlarge
+              - g4dn.16xlarge
+              - g4dn.12xlarge
+              - g4dn.metal
+    relabel_configs:
+      - source_labels: [__meta_ec2_tag_Name]
+        target_label: instance_name
+      - source_labels: [__meta_ec2_tag_Application]
+        target_label: instance_grafana  # NOTE(review): a keep rule only filters targets; this label is likely never written - confirm
+        regex: __Application__  # looks like a placeholder substituted at install time - confirm
+        action: keep
+      - source_labels: [__meta_ec2_instance_id]
+        target_label: instance_id  # label read by the dashboards' instance_id template variable
+      - source_labels: [__meta_ec2_availability_zone]
+        target_label: instance_az
+      - source_labels: [__meta_ec2_instance_state]
+        regex: running
+        action: keep
+        target_label: instance_state  # NOTE(review): same as above - keep does not write target_label
+      - source_labels: [__meta_ec2_instance_type]
+        target_label: instance_type
+      - source_labels: [__meta_ec2_vpc_id]
+        target_label: instance_vpc
\ No newline at end of file
diff --git a/monitoring/www/aws-logo.svg b/monitoring/www/aws-logo.svg
new file mode 100644
index 0000000..4d23322
--- /dev/null
+++ b/monitoring/www/aws-logo.svg
@@ -0,0 +1 @@
+AWS-Logo_White-Color
\ No newline at end of file
diff --git a/monitoring/www/background.png b/monitoring/www/background.png
new file mode 100644
index 0000000..4208555
Binary files /dev/null and b/monitoring/www/background.png differ
diff --git a/monitoring/www/index.html b/monitoring/www/index.html
new file mode 100644
index 0000000..fe63d21
--- /dev/null
+++ b/monitoring/www/index.html
@@ -0,0 +1,97 @@
+
+
+
+
+ AWS ParallelCluster
+
+
+
+
+
+
1Click-HPC
+
1Click-HPC is an open-source project that aims at speeding up the deployment of an HPC Cluster on AWS. You can have a fully functional and ready to use HPC cluster in minutes and with just 1-Click.
+
The 1Click-HPC source code and getting-started guide can be found here.
+
It builds on AWS-supported services and projects, such as:
+
+
+ AWS ParallelCluster
+ an AWS-supported open source cluster management tool that makes it easy for you to deploy and manage High Performance Computing (HPC) clusters on AWS.
+
+ NICE DCV
+ a high-performance remote display protocol that provides customers with a secure way to deliver remote desktops and application streaming from any cloud or data center to any device, over varying network conditions.
+ NICE EnginFrame
+ is an advanced web front-end for accessing technical and scientific applications running on an HPC Cluster
+
+
+
EnginFrame
+
Cluster Monitoring Dashboards
+
Prometheus Server
+
+
+
\ No newline at end of file
diff --git a/parallelcluster/config.ap-east-1.sample b/parallelcluster/config.ap-east-1.sample
index aa17ec5..2b89dd8 100644
--- a/parallelcluster/config.ap-east-1.sample
+++ b/parallelcluster/config.ap-east-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.ap-northeast-1.sample b/parallelcluster/config.ap-northeast-1.sample
index ad4cd28..04c5a3b 100644
--- a/parallelcluster/config.ap-northeast-1.sample
+++ b/parallelcluster/config.ap-northeast-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.ap-northeast-2.sample b/parallelcluster/config.ap-northeast-2.sample
index ad4cd28..04c5a3b 100644
--- a/parallelcluster/config.ap-northeast-2.sample
+++ b/parallelcluster/config.ap-northeast-2.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.ap-south-1.sample b/parallelcluster/config.ap-south-1.sample
index aa17ec5..2b89dd8 100644
--- a/parallelcluster/config.ap-south-1.sample
+++ b/parallelcluster/config.ap-south-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.ca-central-1.sample b/parallelcluster/config.ca-central-1.sample
index aa17ec5..2b89dd8 100644
--- a/parallelcluster/config.ca-central-1.sample
+++ b/parallelcluster/config.ca-central-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.eu-central-1.sample b/parallelcluster/config.eu-central-1.sample
index a7f6f09..47b2321 100644
--- a/parallelcluster/config.eu-central-1.sample
+++ b/parallelcluster/config.eu-central-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.eu-north-1.sample b/parallelcluster/config.eu-north-1.sample
index aa17ec5..2b89dd8 100644
--- a/parallelcluster/config.eu-north-1.sample
+++ b/parallelcluster/config.eu-north-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.eu-south-1.sample b/parallelcluster/config.eu-south-1.sample
index 389f8f6..e2703e9 100644
--- a/parallelcluster/config.eu-south-1.sample
+++ b/parallelcluster/config.eu-south-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, spot-batch, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.eu-west-1.sample b/parallelcluster/config.eu-west-1.sample
index a7f6f09..47b2321 100644
--- a/parallelcluster/config.eu-west-1.sample
+++ b/parallelcluster/config.eu-west-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.us-east-1.sample b/parallelcluster/config.us-east-1.sample
index ad4cd28..04c5a3b 100644
--- a/parallelcluster/config.us-east-1.sample
+++ b/parallelcluster/config.us-east-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.us-east-2.sample b/parallelcluster/config.us-east-2.sample
index ad4cd28..04c5a3b 100644
--- a/parallelcluster/config.us-east-2.sample
+++ b/parallelcluster/config.us-east-2.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.us-west-1.sample b/parallelcluster/config.us-west-1.sample
index ad4cd28..04c5a3b 100644
--- a/parallelcluster/config.us-west-1.sample
+++ b/parallelcluster/config.us-west-1.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/parallelcluster/config.us-west-2.sample b/parallelcluster/config.us-west-2.sample
index ad4cd28..04c5a3b 100644
--- a/parallelcluster/config.us-west-2.sample
+++ b/parallelcluster/config.us-west-2.sample
@@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa
post_install = ${POST_INSTALL}
post_install_args = "${POST_INSTALL_ARGS}"
tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" }
-additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite
+additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess
${EXTRA_JSON}
[fsx new]
diff --git a/scripts/post.install.sh b/scripts/post.install.sh
index 00536d1..9e94c3e 100644
--- a/scripts/post.install.sh
+++ b/scripts/post.install.sh
@@ -82,7 +82,7 @@ export ec2user_home=$(getent passwd | grep ec2-user | sed 's/^.*:.*:.*:.*:.*:\(.
export dna_json="/etc/chef/dna.json"
if [[ -z "${cfn_postinstall_args}" ]]; then
- export myscripts="03.configure.slurm.acct.master.sh 04.configure.slurm.AllOrNothing.master.sh 04.configure.disable.anacron.compute.sh 05.install.ldap.server.master.sh 06.install.ldap.client.compute.sh 06.install.ldap.client.master.sh 07.configure.slurm.tagging.master.sh 10.install.enginframe.master.sh 11.install.ldap.enginframe.master.sh 12.configure.enginframe.alb.master.sh 20.install.dcv.slurm.master.sh 25.install.dcv-server.compute.sh 26.configure.dcv.alb.compute.sh 35.install.dcv.slurm.compute.sh"
+ export myscripts="03.configure.slurm.acct.master.sh 04.configure.slurm.AllOrNothing.master.sh 04.configure.disable.anacron.compute.sh 05.install.ldap.server.master.sh 06.install.ldap.client.compute.sh 06.install.ldap.client.master.sh 07.configure.slurm.tagging.master.sh 10.install.enginframe.master.sh 11.install.ldap.enginframe.master.sh 12.configure.enginframe.alb.master.sh 20.install.dcv.slurm.master.sh 25.install.dcv-server.compute.sh 26.configure.dcv.alb.compute.sh 35.install.dcv.slurm.compute.sh 40.install.monitoring.master.sh 40.install.monitoring.compute.sh"
else
export myscripts="${@:2}"
fi