diff --git a/Templates/AWS-HPC-Cluster.yaml b/Templates/AWS-HPC-Cluster.yaml index a2bc479..73f4441 100644 --- a/Templates/AWS-HPC-Cluster.yaml +++ b/Templates/AWS-HPC-Cluster.yaml @@ -957,8 +957,8 @@ Outputs: Cloud9URL: Description: Cloud9 Environment Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/cloud9/ide/${Cloud9}' - EnginFrameURL: + WebURL: Description: "EnginFrame HPC Portal, default username: ec2-user , default password: Change_this!" Value: !Sub - - 'https://${ALB}/enginframe' + - 'https://${ALB}/' - ALB: !GetAtt ApplicationLoadBalancer.DNSName \ No newline at end of file diff --git a/modules/40.install.monitoring.compute.sh b/modules/40.install.monitoring.compute.sh new file mode 100644 index 0000000..e7637b5 --- /dev/null +++ b/modules/40.install.monitoring.compute.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +source /etc/parallelcluster/cfnconfig +compute_instance_type=$(ec2-metadata -t | awk '{print $2}') +gpu_instances="[pg][2-9].*\.[0-9]*[x]*large" + +monitoring_dir_name="monitoring" +monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}" + +set -x +set -e + +installPreReq() { + yum -y install docker golang-bin + service docker start + chkconfig docker on + usermod -a -G docker $cfn_cluster_user + + #to be replaced with yum -y install docker-compose as the repository problem is fixed + curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + chmod +x /usr/local/bin/docker-compose +} + +configureMonitoring() { + + if [[ $compute_instance_type =~ $gpu_instances ]]; then + distribution=$(. /etc/os-release;echo $ID$VERSION_ID) + curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo + yum -y clean expire-cache + yum -y install nvidia-docker2 + systemctl restart docker + /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.gpu.yml" -p monitoring-compute up -d + + else + /usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.yml" -p monitoring-compute up -d + fi +} + +# main +# ---------------------------------------------------------------------------- +main() { + echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: START" >&2 + installPreReq + configureMonitoring + echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.compute.sh: STOP" >&2 +} + +main "$@" \ No newline at end of file diff --git a/modules/40.install.monitoring.master.sh b/modules/40.install.monitoring.master.sh new file mode 100644 index 0000000..6036ec4 --- /dev/null +++ b/modules/40.install.monitoring.master.sh @@ -0,0 +1,130 @@ +#!/bin/bash + +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this +# software and associated documentation files (the "Software"), to deal in the Software +# without restriction, including without limitation the rights to use, copy, modify, +# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +source /etc/parallelcluster/cfnconfig +cfn_fsx_fs_id=$(cat /etc/chef/dna.json | grep \"cfn_fsx_fs_id\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") +master_instance_id=$(ec2-metadata -i | awk '{print $2}') +cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue') +monitoring_dir_name="monitoring" +monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}" +chef_dna="/etc/chef/dna.json" +s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//") +grafana_password=$(aws secretsmanager get-secret-value --secret-id "${stack_name}" --query SecretString --output text --region "${cfn_region}") +NICE_ROOT=$(jq --arg default "${SHARED_FS_DIR}/nice" -r '.post_install.enginframe | if has("nice_root") then .nice_root else $default end' "${dna_json}") + + +set -x +set -e + +installPreReq() { + yum -y install docker golang-bin + service docker start + chkconfig docker on + usermod -a -G docker $cfn_cluster_user + + #to be replaced with yum -y install docker-compose as the repository problem is fixed + curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose + chmod +x /usr/local/bin/docker-compose +} + +saveClusterConfigLocally(){ + + cluster_s3_bucket=$(cat "${chef_dna}" | grep \"cluster_s3_bucket\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") + cluster_config_s3_key=$(cat "${chef_dna}" | grep \"cluster_config_s3_key\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") + cluster_config_version=$(cat "${chef_dna}" | grep \"cluster_config_version\" | awk '{print $2}' | sed "s/\",//g;s/\"//g") + log_group_names="\/aws\/parallelcluster\/$(echo ${stack_name} | cut -d "-" -f2-)" + + mkdir -p "${monitoring_home}/parallelcluster" + aws s3api get-object --bucket $cluster_s3_bucket --key $cluster_config_s3_key --region $cfn_region --version-id $cluster_config_version "${monitoring_home}/parallelcluster/cluster-config.json" +} + +installMonitoring(){ + + aws s3 cp --recursive "${post_install_base}/monitoring" "${monitoring_home}" --region "${cfn_region}" || exit 1 + chown $cfn_cluster_user:$cfn_cluster_user -R "${monitoring_home}" + chmod +x ${monitoring_home}/custom-metrics/* + + cp -rp ${monitoring_home}/custom-metrics/* /usr/local/bin/ + mv -f "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/ + + cp -rp ${monitoring_home}/www/* "${NICE_ROOT}/enginframe/conf/tomcat/webapps/ROOT/" +} + + + +configureMonitoring() { + + (crontab -l -u $cfn_cluster_user; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u $cfn_cluster_user - + (crontab -l -u $cfn_cluster_user; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u $cfn_cluster_user - + + # replace tokens + sed -i "s/_S3_BUCKET_/${s3_bucket}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json" + sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json" + sed -i "s/__FSX_ID__/${cfn_fsx_fs_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json" + sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json" + + sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/logs.json" + sed -i "s/__LOG_GROUP__NAMES__/${log_group_names}/g" "${monitoring_home}/grafana/dashboards/logs.json" + + sed -i "s/__Application__/${stack_name}/g" "${monitoring_home}/prometheus/prometheus.yml" + + sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/master-node-details.json" + sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-list.json" + sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-details.json" + + sed -i "s~__MONITORING_DIR__~${monitoring_home}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml" + sed -i "s~__GRAFANA_PASSWORD__~${grafana_password}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml" + + + # Download and build prometheus-slurm-exporter + ##### Plese note this software package is under GPLv3 License ##### + # More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE + cd "${monitoring_home}" + #FIXME: temporary + rm -rf prometheus-slurm-exporter + git clone https://github.com/vpenso/prometheus-slurm-exporter.git + cd prometheus-slurm-exporter + sed -i 's/NodeList,AllocMem,Memory,CPUsState,StateLong/NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:/' node.go + GOPATH=/root/go-modules-cache HOME=/root go mod download + GOPATH=/root/go-modules-cache HOME=/root go build + mv -f "${monitoring_home}/prometheus-slurm-exporter/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter +} + + +startMonitoringDaemons() { + + /usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f "${monitoring_home}/docker-compose/docker-compose.master.yml" -p monitoring-master up -d + systemctl daemon-reload + systemctl enable slurm_exporter + systemctl start slurm_exporter + +} + +# main +# ---------------------------------------------------------------------------- +main() { + echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.master.sh: START" >&2 + installPreReq + saveClusterConfigLocally + installMonitoring + configureMonitoring + startMonitoringDaemons + echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] 40.install.monitoring.master.sh: STOP" >&2 +} + +main "$@" \ No newline at end of file diff --git a/monitoring/custom-metrics/1h-cost-metrics.sh b/monitoring/custom-metrics/1h-cost-metrics.sh new file mode 100644 index 0000000..f4d3cb0 --- /dev/null +++ b/monitoring/custom-metrics/1h-cost-metrics.sh @@ -0,0 +1,123 @@ +#!/bin/bash +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# + +#source the AWS ParallelCluster profile +. /etc/parallelcluster/cfnconfig + +export AWS_DEFAULT_REGION=$cfn_region +aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region) +aws_region_long_name=${aws_region_long_name/Europe/EU} + +masterInstanceType=$(ec2-metadata -t | awk '{print $2}') +masterInstanceId=$(ec2-metadata -i | awk '{print $2}') +s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//") +s3_size_gb=$(echo "$(aws s3api list-objects --bucket $s3_bucket --output json --query "[sum(Contents[].Size)]"| sed -n 2p | tr -d ' ') / 1024 / 1024 / 1024" | bc) + + +#retrieve the s3 cost +if [[ $s3_size_gb -le 51200 ]]; then + s3_range=51200 +elif [[ $VAR -le 512000 ]]; then + s3_range=512000 +else + s3_range="Inf" +fi + +####################### S3 ######################### + +s3_cost_gb_month=$(aws --region us-east-1 pricing get-products \ + --service-code AmazonS3 \ + --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=storageClass,Value=General Purpose' \ + --query 'PriceList[0]' --output text \ + | jq -r --arg endRange $s3_range '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[].value | select(.endRange==$endRange).pricePerUnit.USD') + +s3=$(echo "scale=2; $s3_cost_gb_month * $s3_size_gb / 720" | bc) +echo "s3_cost $s3" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost + + +####################### Master ######################### +master_node_h_price=$(aws pricing get-products \ + --region us-east-1 \ + --service-code AmazonEC2 \ + --filters 'Type=TERM_MATCH,Field=instanceType,Value='$masterInstanceType \ + 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \ + 'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \ + 'Type=TERM_MATCH,Field=tenancy,Value=Shared' \ + 'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \ + --output text \ + --query 'PriceList' \ + | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD') + +echo "master_node_cost $master_node_h_price" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost + + +fsx_id=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region \ + | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "FSXOptions"))[0].ParameterValue' \ + | awk -F "," '{print $2}') +fsx_summary=$(aws fsx describe-file-systems --region $cfn_region --file-system-ids $fsx_id) +fsx_size_gb=$(echo $fsx_summary | jq -r '.FileSystems[0].StorageCapacity') +fsx_type=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.DeploymentType') +fsx_throughput=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.PerUnitStorageThroughput') + +if [[ $fsx_type = "SCRATCH_2" ]] || [[ $fsx_type = "SCRATCH_1" ]]; then + fsx_cost_gb_month=$(aws pricing get-products \ + --region us-east-1 \ + --service-code AmazonFSx \ + --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \ + 'Type=TERM_MATCH,Field=throughputCapacity,Value=N/A' \ + --output text \ + --query 'PriceList' \ + | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD') + +elif [ $fsx_type = "PERSISTENT_1" ]; then + fsx_cost_gb_month=$(aws pricing get-products \ + --region us-east-1 \ + --service-code AmazonFSx \ + --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \ + 'Type=TERM_MATCH,Field=throughputCapacity,Value='$fsx_throughput \ + --output text \ + --query 'PriceList' \ + | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD') + +else + fsx_cost_gb_month=0 +fi + +fsx=$(echo "scale=2; $fsx_cost_gb_month * $fsx_size_gb / 720" | bc) +echo "fsx_cost $fsx" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost + + +#parametrize: +ebs_volume_total_cost=0 +ebs_volume_ids=$(aws ec2 describe-instances --instance-ids $masterInstanceId \ + | jq -r '.Reservations | to_entries[].value | .Instances | to_entries[].value | .BlockDeviceMappings | to_entries[].value | .Ebs.VolumeId') + +for ebs_volume_id in $ebs_volume_ids +do + ebs_volume_type=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.VolumeType') + #ebs_volume_iops=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Iops') + ebs_volume_size=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id | jq -r '.Volumes | to_entries[].value.Size') + + ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \ + --service-code AmazonEC2 \ + --query 'PriceList' \ + --output text \ + --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=productFamily,Value=Storage' \ + 'Type=TERM_MATCH,Field=volumeApiName,Value='$ebs_volume_type \ + | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD') + + ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $ebs_volume_size / 720" | bc) + ebs_volume_total_cost=$(echo "scale=2; $ebs_volume_total_cost + $ebs_volume_cost" | bc) +done + +echo "ebs_master_cost $ebs_volume_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost \ No newline at end of file diff --git a/monitoring/custom-metrics/1m-cost-metrics.sh b/monitoring/custom-metrics/1m-cost-metrics.sh new file mode 100644 index 0000000..9a8ee8a --- /dev/null +++ b/monitoring/custom-metrics/1m-cost-metrics.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# + +#!/bin/bash +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# + +#source the AWS ParallelCluster profile +. /etc/parallelcluster/cfnconfig + +export AWS_DEFAULT_REGION=$cfn_region +aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region) +aws_region_long_name=${aws_region_long_name/Europe/EU} + +#FIXME: not hardcode dir +monitoring_dir_name="monitoring" +monitoring_home="/fsx/${monitoring_dir_name}" + +queues=$(/opt/slurm/bin/sinfo --noheader -O partition | sed 's/\*//g') +cluster_config_file="${monitoring_home}/parallelcluster-setup/cluster-config.json" + +compute_nodes_total_cost=0 + +for queue in $queues; do + + instance_type=$(cat "${cluster_config_file}" | jq -r --arg queue $queue '.cluster.queue_settings | to_entries[] | select(.key==$queue).value.compute_resource_settings | to_entries[]| .value.instance_type') + + compute_node_h_price=$(aws pricing get-products \ + --region us-east-1 \ + --service-code AmazonEC2 \ + --filters 'Type=TERM_MATCH,Field=instanceType,Value='$instance_type \ + 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \ + 'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \ + 'Type=TERM_MATCH,Field=tenancy,Value=Shared' \ + 'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \ + --output text \ + --query 'PriceList' \ + | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD') + + ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \ + --service-code AmazonEC2 \ + --query 'PriceList' \ + --output text \ + --filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \ + 'Type=TERM_MATCH,Field=productFamily,Value=Storage' \ + 'Type=TERM_MATCH,Field=volumeApiName,Value=gp2' \ + | jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD') + + total_num_compute_nodes=$(/opt/slurm/bin/sinfo --noheader --partition=$queue | egrep -v "idle~" | awk '{sum += $4} END {if (sum) print sum; else print 0; }') + + ebs_volume_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "ComputeRootVolumeSize"))[0].ParameterValue') + compute_ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $total_num_compute_nodes * $ebs_volume_size / 720" | bc) + compute_nodes_cost=$(echo "scale=2; $total_num_compute_nodes * $compute_node_h_price" | bc) + + compute_nodes_total_cost=$(echo "scale=2; $compute_nodes_total_cost + $compute_nodes_cost" | bc) + +done + +echo "ebs_compute_cost $compute_ebs_volume_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost +echo "compute_nodes_cost $compute_nodes_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost \ No newline at end of file diff --git a/monitoring/custom-metrics/aws-region.py b/monitoring/custom-metrics/aws-region.py new file mode 100644 index 0000000..c5465f9 --- /dev/null +++ b/monitoring/custom-metrics/aws-region.py @@ -0,0 +1,23 @@ +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# +import json +import sys + +from pkg_resources import resource_filename + +region = str(sys.argv[1]) + +name = None +endpoint_file = resource_filename('botocore', 'data/endpoints.json') +with open(endpoint_file, 'r') as ep_file: + data = json.load(ep_file) + for partition in data['partitions']: + if region in partition['regions']: + name = partition['regions'][region]['description'] + break + +print(name) \ No newline at end of file diff --git a/monitoring/docker-compose/docker-compose.compute.gpu.yml b/monitoring/docker-compose/docker-compose.compute.gpu.yml new file mode 100644 index 0000000..e0778ac --- /dev/null +++ b/monitoring/docker-compose/docker-compose.compute.gpu.yml @@ -0,0 +1,28 @@ +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# +version: '3.8' +services: + prometheus-node-exporter: + container_name: node-exporter + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' + image: quay.io/prometheus/node-exporter + command: + - '--path.rootfs=/host' + dcgm-exporter: + container_name: nvidia-dcgm + network_mode: host + pid: host + restart: unless-stopped + image: nvidia/dcgm-exporter + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=all \ No newline at end of file diff --git a/monitoring/docker-compose/docker-compose.compute.yml b/monitoring/docker-compose/docker-compose.compute.yml new file mode 100644 index 0000000..3f1dfd6 --- /dev/null +++ b/monitoring/docker-compose/docker-compose.compute.yml @@ -0,0 +1,18 @@ +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# +version: '3.8' +services: + prometheus-node-exporter: + container_name: node-exporter + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' + image: quay.io/prometheus/node-exporter + command: + - '--path.rootfs=/host' \ No newline at end of file diff --git a/monitoring/docker-compose/docker-compose.master.yml b/monitoring/docker-compose/docker-compose.master.yml new file mode 100644 index 0000000..270f5b8 --- /dev/null +++ b/monitoring/docker-compose/docker-compose.master.yml @@ -0,0 +1,65 @@ +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# +version: '3.8' +services: + pushgateway: + container_name: pushgateway + network_mode: host + pid: host + restart: unless-stopped + image: prom/pushgateway + prometheus: + container_name: prometheus + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '__MONITORING_DIR__/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml' + - 'prometheus-data:/prometheus' + image: prom/prometheus + command: + - '--config.file=/etc/prometheus/prometheus.yml' + - '--storage.tsdb.path=/prometheus' + - '--web.console.libraries=/usr/share/prometheus/console_libraries' + - '--web.console.templates=/usr/share/prometheus/consoles' + - '--web.external-url=/prometheus/' + - '--web.route-prefix=/' + grafana: + container_name: grafana + network_mode: host + pid: host + restart: unless-stopped + environment: + - 'GF_SECURITY_ADMIN_PASSWORD=__GRAFANA_PASSWORD__' + - 'GF_SERVER_ROOT_URL=http://%(domain)s/grafana/' + volumes: + - '__MONITORING_DIR__/grafana:/etc/grafana/provisioning' + - 'grafana-data:/var/lib/grafana' + image: grafana/grafana + prometheus-node-exporter: + container_name: node-exporter + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '/:/host:ro,rslave' + image: quay.io/prometheus/node-exporter + command: + - '--path.rootfs=/host' + nginx: + container_name: nginx + network_mode: host + pid: host + restart: unless-stopped + volumes: + - '__MONITORING_DIR__/nginx/conf.d:/etc/nginx/conf.d/' + - '__MONITORING_DIR__/nginx/ssl:/etc/ssl/' + - '__MONITORING_DIR__/www:/usr/share/nginx/html' + image: nginx +volumes: + prometheus-data: + grafana-data: \ No newline at end of file diff --git a/monitoring/grafana/dashboards/ParallelCluster.json b/monitoring/grafana/dashboards/ParallelCluster.json new file mode 100755 index 0000000..8d48680 --- /dev/null +++ b/monitoring/grafana/dashboards/ParallelCluster.json @@ -0,0 +1,2761 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "tags": [], + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 45, + "panels": [], + "title": "Storage", + "type": "row" + }, + { + "columns": [], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "110%", + "gridPos": { + "h": 9, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 50, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 6, + "desc": false + }, + "styles": [ + { + "alias": "Mounted on", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "mountpoint", + "thresholds": [ + "" + ], + "type": "string", + "unit": "bytes" + }, + { + "alias": "Avail", + "align": "auto", + "colorMode": "value", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #A", + "thresholds": [ + "10000000000", + "20000000000" + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Used", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [ + "0.6", + "0.8" + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Size", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 1, + "link": false, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Filesystem", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "mappingType": 1, + "pattern": "fstype", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "IP", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "mappingType": 1, + "pattern": "instance", + "preserveFormat": false, + "sanitize": false, + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "preserveFormat": true, + "sanitize": false, + "thresholds": [], + "type": "hidden", + "unit": "short" + } + ], + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"}-0", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "C" + }, + { + "expr": "node_filesystem_avail_bytes {instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"}-0", + "format": "table", + "hide": false, + "instant": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + }, + { + "expr": "1-(node_filesystem_free_bytes{instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"} / node_filesystem_size_bytes{instance_id=\"__INSTANCE_ID__\",fstype=~\"ext4|xfs|lustre|nfs|nfs4\"})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "title": "Cluster Storage", + "transform": "table", + "type": "table-old" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": { + "align": null + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 8, + "y": 1 + }, + "hiddenSeries": false, + "id": 52, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "", + "dimensions": {"FileSystemId": "__FSX_ID__"}, + "expression": "", + "id": "fsxRead", + "matchExact": false, + "metricName": "DataReadBytes", + "namespace": "AWS/FSx", + "period": "", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Sum" + ] + }, + { + "alias": "", + "dimensions": {"FileSystemId": "__FSX_ID__"}, + "expression": "", + "hide": false, + "id": "fsxWrite", + "matchExact": false, + "metricName": "DataWriteBytes", + "namespace": "AWS/FSx", + "period": "", + "refId": "B", + "region": "__AWS_REGION__", + "statistics": [ + "Sum" + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "FSx Total throughput ", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "Time", + "DataReadBytes", + "DataWriteBytes" + ] + } + } + } + ], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:9663", + "decimals": 1, + "format": "Bps", + "label": "Total throughput (bytes/sec)", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:9664", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "cloudwatch", + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "align": null + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 9, + "w": 8, + "x": 16, + "y": 1 + }, + "hiddenSeries": false, + "id": 53, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "7.0.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "", + "dimensions": {"FileSystemId": "__FSX_ID__"}, + "expression": "", + "id": "fsxIOPsRead", + "matchExact": false, + "metricName": "DataReadOperations", + "namespace": "AWS/FSx", + "period": "", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Sum" + ] + }, + { + "alias": "", + "dimensions": {"FileSystemId": "__FSX_ID__"}, + "expression": "", + "hide": false, + "id": "fsxIOPsWrite", + "matchExact": false, + "metricName": "DataWriteOperations", + "namespace": "AWS/FSx", + "period": "", + "refId": "B", + "region": "__AWS_REGION__", + "statistics": [ + "Sum" + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "FSx Total IOPs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transformations": [ + { + "id": "filterFieldsByName", + "options": { + "include": { + "names": [ + "Time", + "DataReadOperations", + "DataWriteOperations" + ] + } + } + } + ], + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:9663", + "decimals": 1, + "format": "iops", + "label": "Total IOPs (Operations/sec)", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:9664", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 40, + "panels": [], + "title": "Scheduler Stats", + "type": "row" + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 0, + "displayName": "Jobs", + "mappings": [], + "max": 6, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 3.01 + }, + { + "color": "red", + "value": 5.01 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 11 + }, + "id": 48, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "slurm_queue_configuring", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Slurm Pending Jobs", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 9, + "x": 4, + "y": 11 + }, + "hiddenSeries": false, + "id": 59, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_queue_completing", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Completing Jobs", + "refId": "A" + }, + { + "expr": "slurm_queue_running", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Running Jobs", + "refId": "B" + }, + { + "expr": "slurm_queue_pending", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Pending Jobs", + "refId": "C" + }, + { + "expr": "slurm_queue_completed", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Completed Jobs", + "refId": "D" + }, + { + "expr": "slurm_queue_timeout", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Timed out Jobs", + "refId": "E" + }, + { + "expr": "slurm_queue_failed", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Failed Jobs", + "refId": "F" + }, + { + "expr": "slurm_queue_node_fail", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Failed jobs (due to NodeFail)", + "refId": "G" + }, + { + "expr": "slurm_queue_suspended", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Suspended Jobs", + "refId": "H" + }, + { + "expr": "slurm_queue_cancelled", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Cancelled Jobs", + "refId": "I" + }, + { + "expr": "slurm_queue_preempted", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Preempted Jobs", + "refId": "J" + }, + { + "expr": "slurm_queue_completing + slurm_queue_running + slurm_queue_pending + slurm_queue_completed + slurm_queue_timeout + slurm_queue_failed + slurm_queue_node_fail + slurm_queue_suspended + slurm_queue_cancelled + slurm_queue_preempted", + "hide": true, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "K" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total Jobs", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "decimals": 0, + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Total Nodes": "#052b51" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 11, + "x": 13, + "y": 11 + }, + "hiddenSeries": false, + "id": 57, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "slurm_nodes_alloc", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Allocated Nodes", + "refId": "A" + }, + { + "expr": "slurm_nodes_comp", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Completing Nodes", + "refId": "B" + }, + { + "expr": "slurm_nodes_idle", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Idle Nodes", + "refId": "C" + }, + { + "expr": "slurm_nodes_down", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Down Nodes", + "refId": "E" + }, + { + "expr": "slurm_nodes_drain", + "interval": "", + "intervalFactor": 2, + "legendFormat": "Draining Nodes", + "refId": "F" + }, + { + "expr": "slurm_nodes_err", + "interval": "", + "legendFormat": "Nodes in *error* state", + "refId": "G" + }, + { + "expr": "slurm_nodes_fail", + "interval": "", + "legendFormat": "Nodes in *fail* state", + "refId": "H" + }, + { + "expr": "slurm_nodes_alloc + slurm_nodes_down + slurm_nodes_drain + slurm_nodes_idle + slurm_nodes_mix + slurm_nodes_comp + slurm_nodes_maint", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "legendFormat": "Total Nodes", + "refId": "D" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Nodes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "min": "0", + "decimals": "0", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "displayName": "Jobs", + "mappings": [], + "max": 10, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 0, + "y": 17 + }, + "id": 47, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "slurm_queue_running", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "Running", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Slurm Jobs", + "type": "gauge" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 23 + }, + "id": 2, + "panels": [], + "title": "CPU, Memory & Disk Stats", + "type": "row" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 0, + "y": 24 + }, + "id": 6, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "count(node_cpu_seconds_total{mode=\"idle\",instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total CPU cores", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 2, + "y": 24 + }, + "id": 31, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "count(node_cpu_seconds_total{mode=\"idle\",instance_id!=\"__INSTANCE_ID__\"})/count(count(node_cpu_seconds_total{instance_id!=\"__INSTANCE_ID__\"}) by (instance_id))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg. CPU cores", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 4, + "y": 24 + }, + "id": 17, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total CPU use", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 6, + "y": 24 + }, + "id": 32, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",instance_id!=\"__INSTANCE_ID__\"}[1m]))/count(count(node_cpu_seconds_total{instance_id!=\"__INSTANCE_ID__\"}) by (instance_id))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg. CPU use", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 8, + "y": 24 + }, + "id": 16, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Memory", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 10, + "y": 24 + }, + "id": 33, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})/count(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg. Memory", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 12, + "y": 24 + }, + "id": 24, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"}) - sum(node_memory_MemAvailable_bytes{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Memory use", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 14, + "y": 24 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "(sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"}) - sum(node_memory_MemAvailable_bytes{instance_id!=\"__INSTANCE_ID__\"}))/count(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg. Memory use", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 16, + "y": 24 + }, + "id": 18, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(rate(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Disk (write)", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 18, + "y": 24 + }, + "id": 35, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(rate(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m])) / count(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg. Disk (write)", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 20, + "y": 24 + }, + "id": 25, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(rate(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Total Disk (read)", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 2, + "x": 22, + "y": 24 + }, + "id": 36, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(rate(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m])) / count(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Avg. Disk (read)", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 0, + "y": 28 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "count(node_cpu_seconds_total{mode=\"idle\",instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "Total cores", + "refId": "B" + }, + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\",instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "legendFormat": "Load in cores", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 8, + "x": 8, + "y": 28 + }, + "hiddenSeries": false, + "id": 15, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "Total Memory", + "refId": "B" + }, + { + "expr": "sum(node_memory_MemTotal_bytes{instance_id!=\"__INSTANCE_ID__\"}) - sum(node_memory_MemAvailable_bytes{instance_id!=\"__INSTANCE_ID__\"})", + "interval": "", + "legendFormat": "Used Memory", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 28 + }, + "hiddenSeries": false, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_read_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "Read throughput", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk I/O (read)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 34 + }, + "hiddenSeries": false, + "id": 23, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_written_bytes_total{instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "Write throughput", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk I/O (write)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:10296", + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:10297", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 14, + "panels": [], + "title": "Network Stats", + "type": "row" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-green", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 0, + "y": 41 + }, + "id": 10, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Total Throughput (Transmit)", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 6, + "y": 41 + }, + "id": 28, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))/count(count(node_network_transmit_bytes_total{instance_id!=\"__INSTANCE_ID__\"}) by(instance_id))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Avg. Throughput per Node (Transmit)", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "semi-dark-purple", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 12, + "y": 41 + }, + "id": 11, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Total Throughput (Receive)", + "type": "stat" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "rgb(143, 143, 143)", + "value": null + } + ] + }, + "unit": "Bps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 6, + "x": 18, + "y": 41 + }, + "id": 30, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + } + }, + "pluginVersion": "7.0.6", + "targets": [ + { + "expr": "sum(irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))/count(count(node_network_receive_bytes_total{instance_id!=\"__INSTANCE_ID__\"}) by(instance_id))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Current Avg. Throughput per Node (Receive)", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "decimals": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 46 + }, + "hiddenSeries": false, + "id": 8, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Transmit", + "color": "#56A64B" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "Transmit", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throughput (Transmit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 46 + }, + "hiddenSeries": false, + "id": 12, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Receive", + "color": "#A352CC" + } + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(irate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*',instance_id!=\"__INSTANCE_ID__\"}[1m]))", + "interval": "", + "legendFormat": "Receive", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Throughput (Receive)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 20, + "panels": [], + "title": "S3 stats", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 58 + }, + "hiddenSeries": false, + "id": 55, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1603", + "alias": "transmit", + "color": "#A352CC" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "", + "dimensions": { + "BucketName": "_S3_BUCKET_" + }, + "expression": "", + "id": "", + "matchExact": false, + "metricName": "BytesUploaded", + "namespace": "AWS/S3", + "period": "", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Sum" + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "S3 Throughput (Upload)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 58 + }, + "hiddenSeries": false, + "id": 54, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "show": true, + "total": true, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:1603", + "alias": "transmit", + "color": "#A352CC" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "", + "dimensions": { + "BucketName": "_S3_BUCKET_" + }, + "expression": "", + "id": "", + "matchExact": false, + "metricName": "BytesDownloaded", + "namespace": "AWS/S3", + "period": "", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Sum" + ] + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "S3 Throughput (Download)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "ParallelCluster Summary", + "uid": "r9_U2AzGz", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/compute-node-details.json b/monitoring/grafana/dashboards/compute-node-details.json new file mode 100755 index 0000000..951a7f6 --- /dev/null +++ b/monitoring/grafana/dashboards/compute-node-details.json @@ -0,0 +1,2198 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "gnetId": 11074, + "graphTooltip": 0, + "id": 2, + "iteration": 1584377467734, + "links": [ + { + "icon": "external link", + "tags": [], + "type": "dashboards" + } + ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "decimals": 1, + "description": "", + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 0, + "y": 0 + }, + "hideTimeOverride": true, + "id": 15, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "null", + "nullText": null, + "options": {}, + "pluginVersion": "6.4.2", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(time() - node_boot_time_seconds{instance=~\"$node\"})", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,2", + "title": "System Uptime", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "decimals": 2, + "description": "", + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 2, + "y": 0 + }, + "id": 75, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$node\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 20 + } + ], + "thresholds": "2,3", + "title": "Total RAM", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "datasource": "prometheus", + "gridPos": { + "h": 6, + "w": 5, + "x": 4, + "y": 0 + }, + "id": 177, + "options": { + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "last" + ], + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "title": "", + "unit": "percent" + }, + "overrides": [], + "values": false + }, + "orientation": "horizontal", + "showUnfilled": true + }, + "pluginVersion": "6.6.2", + "targets": [ + { + "expr": "100 - (avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) * 100)", + "instant": true, + "legendFormat": "CPU Busy", + "refId": "A" + }, + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100", + "hide": true, + "instant": true, + "legendFormat": "Busy Iowait", + "refId": "C" + }, + { + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / (node_memory_MemTotal_bytes{instance=~\"$node\"})))* 100", + "instant": true, + "legendFormat": "Used RAM Memory", + "refId": "B" + }, + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"} * 100) / node_filesystem_size_bytes {instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"})", + "hide": false, + "instant": true, + "legendFormat": "Used Max Mount($maxmount)", + "refId": "D" + }, + { + "expr": "(1 - (node_memory_SwapFree_bytes{instance=~\"$node\"} / node_memory_SwapTotal_bytes{instance=~\"$node\"})) * 100", + "instant": true, + "legendFormat": "Used SWAP", + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "", + "type": "bargauge" + }, + { + "aliasColors": { + "15分钟": "#6ED0E0", + "1分钟": "#BF1B00", + "5分钟": "#CCA300" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "fillGradient": 1, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 9, + "y": 0 + }, + "height": "300", + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_1m", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_5m", + "refId": "B", + "step": 20 + }, + { + "expr": "node_load15{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_15m", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "System Load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [], + "datasource": "prometheus", + "fontSize": "110%", + "gridPos": { + "h": 6, + "w": 7, + "x": 17, + "y": 0 + }, + "id": 164, + "links": [], + "options": {}, + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 6, + "desc": false + }, + "styles": [ + { + "alias": "Mounted on", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "mountpoint", + "thresholds": [ + "" + ], + "type": "string", + "unit": "bytes" + }, + { + "alias": "Avail", + "align": "auto", + "colorMode": "value", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #A", + "thresholds": [ + "10000000000", + "20000000000" + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Used", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [ + "0.6", + "0.8" + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Size", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 1, + "link": false, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Filesystem", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "mappingType": 1, + "pattern": "fstype", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "IP", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "mappingType": 1, + "pattern": "instance", + "preserveFormat": false, + "sanitize": false, + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "preserveFormat": true, + "sanitize": false, + "thresholds": [], + "type": "hidden", + "unit": "short" + } + ], + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}-0", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "C" + }, + { + "expr": "node_filesystem_avail_bytes {instance=~'$node',fstype=~\"ext4|xfs\"}-0", + "format": "table", + "hide": false, + "instant": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + }, + { + "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "title": "Disk Space Used Basic(EXT4/XFS)", + "transform": "table", + "type": "table" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "description": "", + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 0, + "y": 3 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "options": {}, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(count(node_cpu_seconds_total{instance=~\"$node\", mode='system'}) by (cpu))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "decimals": 2, + "description": "", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 2, + "y": 3 + }, + "id": 20, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "options": {}, + "pluginVersion": "6.4.2", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "#3274D9", + "show": true, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": "20,50", + "timeFrom": null, + "timeShift": null, + "title": "CPU IOwait", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + "192.168.200.241:9100_Total": "dark-red", + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "sdb_每秒I/O操作%": "#d683ce", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE", + "磁盘花费在I/O操作占比": "#ba43a9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*_Total/", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"system\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_System", + "refId": "A", + "step": 20 + }, + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"user\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_User", + "refId": "B", + "step": 240 + }, + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Iowait", + "refId": "D", + "step": 240 + }, + { + "expr": "1 - avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Total", + "refId": "F", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 2, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "192.168.10.227:9100_em1_in下载": "super-light-green", + "192.168.10.227:9100_em1_out上传": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 6 + }, + "height": "300", + "hiddenSeries": false, + "id": 157, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_transmit$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_receive", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_transmit", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bps", + "label": "transmit(-)/receive(+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "192.168.200.241:9100_总内存": "dark-red", + "内存_Avaliable": "#6ED0E0", + "内存_Cached": "#EF843C", + "内存_Free": "#629E51", + "内存_Total": "#6d1f62", + "内存_Used": "#eab839", + "可用": "#9ac48a", + "总内存": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 17 + }, + "height": "300", + "hiddenSeries": false, + "id": 156, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Total/", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Total", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Used", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_MemAvailable_bytes{instance=~\"$node\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_Avaliable", + "refId": "F", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "vda_write": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Per second read / write bytes ", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 17 + }, + "height": "300", + "hiddenSeries": false, + "id": 168, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Read bytes$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_bytes_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Read bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "irate(node_disk_written_bytes_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Written bytes", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Data", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "Bps", + "label": "Bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 174, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Inodes.*/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}:{{mountpoint}}", + "refId": "A" + }, + { + "expr": "node_filesystem_files_free{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_files{instance=~'$node',fstype=~\"ext4|xfs\"}", + "hide": true, + "legendFormat": "Inodes:{{instance}}:{{mountpoint}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 2, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": 2, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "filefd_192.168.200.241:9100": "super-light-green", + "switches_192.168.200.241:9100": "semi-dark-red" + }, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fill": 0, + "fillGradient": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 29 + }, + "hiddenSeries": false, + "hideTimeOverride": false, + "id": 16, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.4.2", + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/filefd_.*/", + "lines": false, + "pointradius": 1, + "points": true + }, + { + "alias": "/switches_.*/", + "color": "#F2495C", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filefd_allocated{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 5, + "legendFormat": "filefd_{{instance}}", + "refId": "B" + }, + { + "expr": "irate(node_context_switches_total{instance=~\"$node\"}[30m])", + "intervalFactor": 5, + "legendFormat": "switches_{{instance}}", + "refId": "A" + }, + { + "expr": "node_filefd_maximum{instance=~\"$node\"}", + "hide": true, + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Open File Descriptor(left)/Context switches(right)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "context_switches", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "sdb_每秒I/O操作%": "#d683ce", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE", + "磁盘花费在I/O操作占比": "#ba43a9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": null, + "description": "The time spent on I/O in the natural time of each second.(wall-clock time)", + "fill": 1, + "fillGradient": 5, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 29 + }, + "hiddenSeries": false, + "id": 175, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_ IO time", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Spent Doing I/Os", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "TCP": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "TCP_alloc - Allocated sockets\n\nCurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT\n\nTCP_tw - Sockets wating close\n\nUDP_inuse - Udp sockets currently in use\n\nSockets_used - Sockets currently in use", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 35 + }, + "height": "300", + "hiddenSeries": false, + "id": 158, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Sockets_used/", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_netstat_Tcp_CurrEstab{instance=~'$node'}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_CurrEstab", + "refId": "A", + "step": 20 + }, + { + "expr": "node_sockstat_TCP_tw{instance=~'$node'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_TCP_tw", + "refId": "D" + }, + { + "expr": "node_sockstat_sockets_used{instance=~'$node'}", + "legendFormat": "{{instance}}_Sockets_used", + "refId": "B" + }, + { + "expr": "node_sockstat_UDP_inuse{instance=~'$node'}", + "legendFormat": "{{instance}}_UDP_inuse", + "refId": "C" + }, + { + "expr": "node_sockstat_TCP_alloc{instance=~'$node'}", + "legendFormat": "{{instance}}_TCP_alloc", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Sockstat", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "vda_write": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Read/write completions per second", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 35 + }, + "height": "300", + "hiddenSeries": false, + "id": 161, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Reads completed$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Reads completed", + "refId": "A", + "step": 10 + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Writes completed", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Completed", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "vda": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Time spent on each read/write operation", + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 35 + }, + "height": "300", + "hiddenSeries": false, + "id": 160, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/,*_Read time$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Read time", + "refId": "B" + }, + { + "expr": "irate(node_disk_write_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Write time", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Time(Reference: less than 100ms)(beta)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "Time. read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 22, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "i-08ab98802857b0864", + "value": "i-08ab98802857b0864" + }, + "datasource": "prometheus", + "definition": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)", + "hide": 0, + "includeAll": false, + "label": "Instance ID", + "multi": false, + "name": "instance_id", + "options": [], + "query": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)", + "refresh": 1, + "regex": "^(?!.*__INSTANCE_ID__).*$", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allFormat": "glob", + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)", + "hide": 0, + "includeAll": true, + "label": "IP", + "multi": false, + "multiFormat": "regex values", + "name": "node", + "options": [], + "query": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "/", + "value": "/" + }, + "datasource": "prometheus", + "definition": "", + "hide": 2, + "includeAll": false, + "label": "", + "multi": false, + "name": "maxmount", + "options": [], + "query": "query_result(topk(1,sort_desc (max(node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}) by (mountpoint))))", + "refresh": 2, + "regex": "/.*\\\"(.*)\\\".*/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allFormat": "glob", + "allValue": null, + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": "prometheus", + "definition": "", + "hide": 2, + "includeAll": false, + "label": "环境", + "multi": false, + "multiFormat": "regex values", + "name": "env", + "options": [], + "query": "label_values(node_exporter_build_info,env)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allFormat": "glob", + "allValue": "", + "current": { + "text": "None", + "value": [ + "" + ] + }, + "datasource": "prometheus", + "definition": "label_values(node_exporter_build_info{env=~'$env'},name)", + "hide": 2, + "includeAll": false, + "label": "名称", + "multi": true, + "multiFormat": "regex values", + "name": "name", + "options": [], + "query": "label_values(node_exporter_build_info{env=~'$env'},name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "/.*/", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Compute Node Details", + "uid": "qI8VfvXZz", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/compute-node-list.json b/monitoring/grafana/dashboards/compute-node-list.json new file mode 100755 index 0000000..068780a --- /dev/null +++ b/monitoring/grafana/dashboards/compute-node-list.json @@ -0,0 +1,236 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1592242343557, + "links": [ + { + "$$hashKey": "object:53", + "icon": "external link", + "tags": [], + "type": "dashboards" + } + ], + "panels": [ + { + "columns": [], + "datasource": null, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "100%", + "gridPos": { + "h": 24, + "w": 9, + "x": 0, + "y": 0 + }, + "id": 2, + "pageSize": null, + "showHeader": true, + "sort": { + "col": 2, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "align": "auto", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Availability Zone", + "align": "left", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "instance_az", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + "alias": "Instance Id", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTargetBlank": true, + "linkTooltip": "Go To Node Details", + "linkUrl": "/grafana/d/qI8VfvXZz/node-details-copy?var-instance_id=${__cell}", + "mappingType": 1, + "pattern": "instance_id", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + "alias": "Instance Type", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "instance_type", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU load", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #A", + "thresholds": [], + "type": "number", + "unit": "short" + }, + { + "alias": "Transmit Rate", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [], + "type": "number", + "unit": "Bps" + }, + { + "alias": "Receive Rate", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [], + "type": "number", + "unit": "Bps" + } + ], + "targets": [ + { + "expr": "sum(rate(node_cpu_seconds_total{mode!=\"idle\"}[1m])) by (instance_id, instance_type, instance_az)", + "format": "table", + "instant": true, + "legendFormat": "", + "refId": "A" + }, + { + "expr": "sum(rate(node_network_transmit_bytes_total[1m])) by (instance_id, instance_type, instance_az)", + "format": "table", + "instant": true, + "refId": "B" + }, + { + "expr": "sum(rate(node_network_receive_bytes_total[1m])) by (instance_id, instance_type, instance_az)", + "format": "table", + "instant": true, + "refId": "C" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "All Available Nodes", + "transform": "table", + "type": "table-old" + } + ], + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "datasource": null, + "filters": [ + { + "condition": "", + "key": "instance_id", + "operator": "!=", + "value": "__INSTANCE_ID__" + } + ], + "hide": 2, + "label": "", + "name": "Filters", + "skipUrlSync": false, + "type": "adhoc" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Compute Node List", + "uid": "SugNQvuWk", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/costs.json b/monitoring/grafana/dashboards/costs.json new file mode 100755 index 0000000..0aa1b82 --- /dev/null +++ b/monitoring/grafana/dashboards/costs.json @@ -0,0 +1,674 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [ + { + "$$hashKey": "object:56", + "asDropdown": false, + "icon": "external link", + "tags": [], + "title": "Dash", + "type": "dashboards" + } + ], + "panels": [ + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 10, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#6ED0E0", + "value": 500 + }, + { + "color": "#EAB839", + "value": 1000 + }, + { + "color": "red", + "value": 1800 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 10, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "master_node_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Master Node hourly Cost", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 5, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 4.01 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 7, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "ebs_master_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "EBS (Master node) hourly Cost", + "type": "gauge" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 14, + "w": 16, + "x": 8, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "s3_cost+master_node_cost+fsx_cost+ebs_master_cost+compute_nodes_cost+ebs_compute_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "Entire Cluster cost", + "refId": "A" + }, + { + "expr": "s3_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "S3", + "refId": "B" + }, + { + "expr": "master_node_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "Master Node", + "refId": "C" + }, + { + "expr": "fsx_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "FSx", + "refId": "D" + }, + { + "expr": "ebs_master_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "EBS (Master)", + "refId": "E" + }, + { + "expr": "compute_nodes_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "Compute Nodes", + "refId": "F" + }, + { + "expr": "ebs_compute_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": "EBS (Compute)", + "refId": "G" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cluster Total Hourly Cost", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 1, + "format": "currencyUSD", + "label": "$ / hour", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 2000, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 500.01 + }, + { + "color": "red", + "value": 1800 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 7 + }, + "id": 9, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "compute_nodes_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Compute Node hourly Cost", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 20, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5.01 + }, + { + "color": "red", + "value": 18 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 7 + }, + "id": 8, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "ebs_compute_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "EBS (Compute nodes) hourly Cost", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 20, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5.01 + }, + { + "color": "red", + "value": 18 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 0, + "y": 14 + }, + "id": 5, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "fsx_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "FSx hourly Cost", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 20, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5.01 + }, + { + "color": "red", + "value": 18 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 4, + "y": 14 + }, + "id": 6, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "s3_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "S3 hourly Cost", + "type": "gauge" + }, + { + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "decimals": 2, + "displayName": "Jobs", + "mappings": [], + "max": 20, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 5.01 + }, + { + "color": "red", + "value": 18 + } + ] + }, + "unit": "currencyUSD" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 8, + "y": 14 + }, + "id": 11, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "7.1.0", + "targets": [ + { + "expr": "s3_cost", + "instant": false, + "interval": "1", + "intervalFactor": 1, + "legendFormat": " ", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "EFS hourly cost", + "type": "gauge" + } + ], + "refresh": "1m", + "schemaVersion": 26, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Cluster Cost", + "uid": "LTU-foiMk", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/dashboards.yml b/monitoring/grafana/dashboards/dashboards.yml new file mode 100755 index 0000000..181f601 --- /dev/null +++ b/monitoring/grafana/dashboards/dashboards.yml @@ -0,0 +1,10 @@ +apiVersion: 1 +providers: + - name: "Dashboards" + orgId: 1 + folder: "" + type: file + disableDeletion: false + editable: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/monitoring/grafana/dashboards/gpu.json b/monitoring/grafana/dashboards/gpu.json new file mode 100644 index 0000000..fa3d35c --- /dev/null +++ b/monitoring/grafana/dashboards/gpu.json @@ -0,0 +1,1921 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Dashboard to visualize data from the NVIDIA Data Center GPU Manager (DCGM)", + "editable": true, + "gnetId": 11752, + "graphTooltip": 0, + "iteration": 1606149765866, + "links": [ + { + "icon": "external link", + "tags": [], + "type": "dashboards" + } + ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 0, + "y": 0 + }, + "hideTimeOverride": true, + "id": 59, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "nodename", + "targets": [ + { + "expr": "node_uname_info{instance_id=~\"$instance_id\"}", + "format": "table", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Hostname", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 3, + "x": 3, + "y": 0 + }, + "id": 60, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "release", + "targets": [ + { + "expr": "node_uname_info{instance_id=\"$instance_id\"}", + "format": "table", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Kernel", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 6, + "y": 0 + }, + "id": 58, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80,90", + "title": "GPU Total Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "watt", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 9, + "y": 0 + }, + "id": 30, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "60,90", + "title": "GPU Total Power", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "celsius", + "gauge": { + "maxValue": 90, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 0 + }, + "id": 31, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "83,87", + "title": "GPU Avg. Temperature", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 15, + "y": 0 + }, + "id": 68, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_DEC_UTIL{instance_id=\"$instance_id\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80,90", + "title": "GPU Decored Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 18, + "y": 0 + }, + "id": 67, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_ENC_UTIL{instance_id=\"$instance_id\"}", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80,90", + "title": "GPU Encored Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 0 + }, + "id": 40, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}/(DCGM_FI_DEV_FB_FREE{instance_id=\"$instance_id\"}+DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"})*100", + "format": "time_series", + "instant": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "70,90", + "title": "GPU Mem Util.", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 0, + "y": 2 + }, + "id": 44, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}*1000000", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "GPU SM Clocks", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "hertz", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 3, + "x": 3, + "y": 2 + }, + "id": 45, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}*1000000", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "GPU Memory Clocks", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 5 + }, + "hiddenSeries": false, + "id": 35, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(DCGM_FI_DEV_NVLINK_BANDWIDTH_TOTAL{instance_id=\"$instance_id\"}[$__interval]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Total", + "refId": "A" + }, + { + "expr": "dcgm_nv", + "format": "time_series", + "hide": true, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "NVLINK Bandwidth", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 5 + }, + "hiddenSeries": false, + "id": 57, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_UTIL{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 5 + }, + "hiddenSeries": false, + "id": 25, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_GPU_TEMP{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Temperature", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "celsius", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 10 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Rx", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_PCIE_TX_THROUGHPUT{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "GPU{{gpu}}_Tx", + "refId": "A" + }, + { + "expr": "DCGM_FI_DEV_PCIE_RX_THROUGHPUT{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "GPU{{gpu}}_Rx", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PCIe Throughput", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "KBs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 10 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_SM_CLOCK{instance_id=\"$instance_id\"}", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU SM Clocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "hertz", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 10 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_POWER_USAGE{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Power Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_FB_USED{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Mem Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:1193", + "decimals": null, + "format": "decmbytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:1194", + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 15 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_CLOCK{instance_id=\"$instance_id\"}", + "interval": "", + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Memory Clocks", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "hertz", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 15 + }, + "hiddenSeries": false, + "id": 39, + "legend": { + "avg": false, + "current": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.3.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "DCGM_FI_DEV_MEM_COPY_UTIL{instance_id=\"$instance_id\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "GPU {{gpu}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Mem Cpy Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": "100", + "min": "0", + "show": true + }, + { + "format": "watt", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "GPU" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": true, + "text": "i-0edc71e97259faa4c", + "value": "i-0edc71e97259faa4c" + }, + "datasource": "prometheus", + "definition": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"[pg][2-4].*\"}, instance_id)", + "error": null, + "hide": 0, + "includeAll": false, + "label": "Instance ID", + "multi": false, + "name": "instance_id", + "options": [], + "query": "label_values(node_uname_info{job=~\"ec2_instances\",instance_type=~\"[pg][2-4].*\"}, instance_id)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "GPU Nodes", + "uid": "hpcsyl6zhqk", + "version": 2 +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/logs.json b/monitoring/grafana/dashboards/logs.json new file mode 100755 index 0000000..c3c2803 --- /dev/null +++ b/monitoring/grafana/dashboards/logs.json @@ -0,0 +1,729 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "links": [ + { + "$$hashKey": "object:58", + "icon": "external link", + "tags": [], + "type": "dashboards" + } + ], + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 15, + "panels": [], + "title": "Slurm", + "type": "row" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 16, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /slurmctld/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Slurmctld", + "type": "logs" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 17, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /slurmd/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Slurmd", + "type": "logs" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 10, + "panels": [], + "title": "Watchers", + "type": "row" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 8, + "x": 0, + "y": 14 + }, + "id": 11, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /sqswatcher/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "SQS-Watcher Logs", + "type": "logs" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 8, + "x": 8, + "y": 14 + }, + "id": 12, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /nodewatcher/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Node Watcher Logs", + "type": "logs" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 15, + "w": 8, + "x": 16, + "y": 14 + }, + "id": 13, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /jobwatcher/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "JOB Watcher Logs", + "type": "logs" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 19, + "panels": [], + "title": "Init", + "type": "row" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 20, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /cfn-init/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "cfn-init", + "type": "logs" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 11, + "x": 12, + "y": 30 + }, + "id": 21, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /cloud-init/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "cloud-init", + "type": "logs" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 43 + }, + "id": 23, + "panels": [], + "title": "Others", + "type": "row" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 24, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /supervisord/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Supervisord", + "type": "logs" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 14, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 25, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /system-messages/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "System Messages", + "type": "logs" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 58 + }, + "id": 6, + "panels": [], + "title": "DCV", + "type": "row" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 3, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /dcv-authenticator/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "DCV Authenticator Logs", + "type": "logs" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 13, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 4, + "options": { + "showLabels": false, + "showTime": false, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message, @logStream\n| sort @timestamp desc\n| limit 20\n| filter @logStream like /dcv-server/", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "DCV Server Logs", + "type": "logs" + }, + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 72 + }, + "id": 8, + "panels": [], + "title": "Aggregate Logs", + "type": "row" + }, + { + "datasource": "cloudwatch", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 73 + }, + "id": 2, + "options": { + "showLabels": false, + "showTime": true, + "sortOrder": "Descending", + "wrapLogMessage": true + }, + "targets": [ + { + "alias": "", + "apiMode": "Logs", + "dimensions": {}, + "expression": "fields @timestamp, @message \n| sort @timestamp desc\n| limit 20", + "id": "", + "logGroupNames": [ + "__LOG_GROUP__NAMES__" + ], + "matchExact": true, + "metricName": "", + "namespace": "", + "period": "", + "queryMode": "Logs", + "refId": "A", + "region": "__AWS_REGION__", + "statistics": [ + "Average" + ], + "statsGroups": [] + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Cluster Logs (ALL)", + "type": "logs" + } + ], + "refresh": "30s", + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "hidden": false, + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Cluster Logs", + "uid": "o_8MQ5mMk", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/grafana/dashboards/master-node-details.json b/monitoring/grafana/dashboards/master-node-details.json new file mode 100755 index 0000000..8c4e229 --- /dev/null +++ b/monitoring/grafana/dashboards/master-node-details.json @@ -0,0 +1,2299 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "gnetId": 11074, + "graphTooltip": 0, + "iteration": 1592040183080, + "links": [ + { + "icon": "external link", + "tags": [], + "type": "dashboards" + } + ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorPrefix": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "decimals": 1, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 0, + "y": 0 + }, + "hideTimeOverride": true, + "id": 15, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "null", + "nullText": null, + "pluginVersion": "6.4.2", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(time() - node_boot_time_seconds{instance=~\"$node\"})", + "format": "time_series", + "hide": false, + "instant": true, + "intervalFactor": 1, + "refId": "A", + "step": 40 + } + ], + "thresholds": "1,2", + "title": "System Uptime", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "decimals": 2, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 2, + "y": 0 + }, + "id": 75, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "70%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(node_memory_MemTotal_bytes{instance=~\"$node\"})", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "refId": "A", + "step": 20 + } + ], + "thresholds": "2,3", + "title": "Total RAM", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": {}, + "displayName": "", + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 60 + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 5, + "x": 4, + "y": 0 + }, + "id": 177, + "options": { + "displayMode": "lcd", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showUnfilled": true + }, + "pluginVersion": "7.0.3", + "targets": [ + { + "expr": "100 - (avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) * 100)", + "instant": true, + "legendFormat": "CPU Busy", + "refId": "A" + }, + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100", + "hide": true, + "instant": true, + "legendFormat": "Busy Iowait", + "refId": "C" + }, + { + "expr": "(1 - (node_memory_MemAvailable_bytes{instance=~\"$node\"} / (node_memory_MemTotal_bytes{instance=~\"$node\"})))* 100", + "instant": true, + "legendFormat": "Used RAM Memory", + "refId": "B" + }, + { + "expr": "100 - ((node_filesystem_avail_bytes{instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"} * 100) / node_filesystem_size_bytes {instance=~\"$node\",mountpoint=\"$maxmount\",fstype=~\"ext4|xfs\"})", + "hide": false, + "instant": true, + "legendFormat": "Used Max Mount($maxmount)", + "refId": "D" + }, + { + "expr": "(1 - (node_memory_SwapFree_bytes{instance=~\"$node\"} / node_memory_SwapTotal_bytes{instance=~\"$node\"})) * 100", + "instant": true, + "legendFormat": "Used SWAP", + "refId": "E" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "", + "type": "bargauge" + }, + { + "aliasColors": { + "15分钟": "#6ED0E0", + "1分钟": "#BF1B00", + "5分钟": "#CCA300" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 1, + "grid": {}, + "gridPos": { + "h": 6, + "w": 8, + "x": 9, + "y": 0 + }, + "height": "300", + "hiddenSeries": false, + "id": 13, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_1m", + "metric": "", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_5m", + "refId": "B", + "step": 20 + }, + { + "expr": "node_load15{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_15m", + "refId": "C", + "step": 20 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "System Load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [], + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fontSize": "110%", + "gridPos": { + "h": 6, + "w": 7, + "x": 17, + "y": 0 + }, + "id": 164, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 6, + "desc": false + }, + "styles": [ + { + "alias": "Mounted on", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "mountpoint", + "thresholds": [ + "" + ], + "type": "string", + "unit": "bytes" + }, + { + "alias": "Avail", + "align": "auto", + "colorMode": "value", + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #A", + "thresholds": [ + "10000000000", + "20000000000" + ], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Used", + "align": "auto", + "colorMode": "cell", + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "mappingType": 1, + "pattern": "Value #B", + "thresholds": [ + "0.6", + "0.8" + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Size", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 1, + "link": false, + "mappingType": 1, + "pattern": "Value #C", + "thresholds": [], + "type": "number", + "unit": "bytes" + }, + { + "alias": "Filesystem", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "mappingType": 1, + "pattern": "fstype", + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "IP", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "mappingType": 1, + "pattern": "instance", + "preserveFormat": false, + "sanitize": false, + "thresholds": [], + "type": "string", + "unit": "short" + }, + { + "alias": "", + "align": "auto", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "preserveFormat": true, + "sanitize": false, + "thresholds": [], + "type": "hidden", + "unit": "short" + } + ], + "targets": [ + { + "expr": "node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}-0", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "C" + }, + { + "expr": "node_filesystem_avail_bytes {instance=~'$node',fstype=~\"ext4|xfs\"}-0", + "format": "table", + "hide": false, + "instant": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + }, + { + "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})", + "format": "table", + "hide": false, + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "B" + } + ], + "title": "Disk Space Used Basic(EXT4/XFS)", + "transform": "table", + "type": "table-old" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorPostfix": false, + "colorValue": true, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "short", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 0, + "y": 3 + }, + "id": 14, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "maxPerRow": 6, + "nullPointMode": "null", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(count(node_cpu_seconds_total{instance=~\"$node\", mode='system'}) by (cpu))", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "CPU Cores", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "decimals": 2, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 2, + "x": 2, + "y": 3 + }, + "id": 20, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "pluginVersion": "6.4.2", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "#3274D9", + "show": true, + "ymax": null, + "ymin": null + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) * 100", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "thresholds": "20,50", + "timeFrom": null, + "timeShift": null, + "title": "CPU IOwait", + "type": "singlestat", + "valueFontSize": "100%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + "192.168.200.241:9100_Total": "dark-red", + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "sdb_每秒I/O操作%": "#d683ce", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE", + "磁盘花费在I/O操作占比": "#ba43a9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "/.*_Total/", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"system\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_System", + "refId": "A", + "step": 20 + }, + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"user\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_User", + "refId": "B", + "step": 240 + }, + { + "expr": "avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"iowait\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Iowait", + "refId": "D", + "step": 240 + }, + { + "expr": "1 - avg(irate(node_cpu_seconds_total{instance=~\"$node\",mode=\"idle\"}[30m])) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Total", + "refId": "F", + "step": 240 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 2, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "192.168.10.227:9100_em1_in下载": "super-light-green", + "192.168.10.227:9100_em1_out上传": "dark-blue" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 6 + }, + "height": "300", + "hiddenSeries": false, + "id": 157, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_transmit$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_network_receive_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_receive", + "refId": "A", + "step": 4 + }, + { + "expr": "irate(node_network_transmit_bytes_total{instance=~'$node',device!~'tap.*|veth.*|br.*|docker.*|virbr*|lo*'}[30m])*8", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_transmit", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Traffic Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:228", + "format": "bps", + "label": "transmit(-)/receive(+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:229", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "192.168.200.241:9100_总内存": "dark-red", + "内存_Avaliable": "#6ED0E0", + "内存_Cached": "#EF843C", + "内存_Free": "#629E51", + "内存_Total": "#6d1f62", + "内存_Used": "#eab839", + "可用": "#9ac48a", + "总内存": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 17 + }, + "height": "300", + "hiddenSeries": false, + "id": 156, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Total/", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"}", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Total", + "refId": "A", + "step": 4 + }, + { + "expr": "node_memory_MemTotal_bytes{instance=~\"$node\"} - node_memory_MemAvailable_bytes{instance=~\"$node\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_Used", + "refId": "B", + "step": 4 + }, + { + "expr": "node_memory_MemAvailable_bytes{instance=~\"$node\"}", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_Avaliable", + "refId": "F", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "vda_write": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Per second read / write bytes ", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 17 + }, + "height": "300", + "hiddenSeries": false, + "id": 168, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:131", + "alias": "/.*_Read bytes$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_bytes_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Read bytes", + "refId": "A", + "step": 10 + }, + { + "expr": "irate(node_disk_written_bytes_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Written bytes", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Data", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:144", + "decimals": null, + "format": "Bps", + "label": "Bytes read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "$$hashKey": "object:145", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 3, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 29 + }, + "hiddenSeries": false, + "id": 174, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Inodes.*/", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1-(node_filesystem_free_bytes{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}:{{mountpoint}}", + "refId": "A" + }, + { + "expr": "node_filesystem_files_free{instance=~'$node',fstype=~\"ext4|xfs\"} / node_filesystem_files{instance=~'$node',fstype=~\"ext4|xfs\"}", + "hide": true, + "legendFormat": "Inodes:{{instance}}:{{mountpoint}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk Space Used Basic", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 2, + "format": "percentunit", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": 2, + "format": "percentunit", + "label": null, + "logBase": 1, + "max": "1", + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "filefd_192.168.200.241:9100": "super-light-green", + "switches_192.168.200.241:9100": "semi-dark-red" + }, + "bars": false, + "cacheTimeout": null, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 29 + }, + "hiddenSeries": false, + "hideTimeOverride": false, + "id": 16, + "legend": { + "alignAsTable": false, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.4.2", + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/filefd_.*/", + "lines": false, + "pointradius": 1, + "points": true + }, + { + "alias": "/switches_.*/", + "color": "#F2495C", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_filefd_allocated{instance=~\"$node\"}", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 5, + "legendFormat": "filefd_{{instance}}", + "refId": "B" + }, + { + "expr": "irate(node_context_switches_total{instance=~\"$node\"}[30m])", + "intervalFactor": 5, + "legendFormat": "switches_{{instance}}", + "refId": "A" + }, + { + "expr": "node_filefd_maximum{instance=~\"$node\"}", + "hide": true, + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Open File Descriptor(left)/Context switches(right)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "context_switches", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Idle - Waiting for something to happen": "#052B51", + "guest": "#9AC48A", + "idle": "#052B51", + "iowait": "#EAB839", + "irq": "#BF1B00", + "nice": "#C15C17", + "sdb_每秒I/O操作%": "#d683ce", + "softirq": "#E24D42", + "steal": "#FCE2DE", + "system": "#508642", + "user": "#5195CE", + "磁盘花费在I/O操作占比": "#ba43a9" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": null, + "description": "The time spent on I/O in the natural time of each second.(wall-clock time)", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 5, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 29 + }, + "hiddenSeries": false, + "id": 175, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_io_time_seconds_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_ IO time", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Time Spent Doing I/Os", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "vda_write": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Read/write completions per second", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 8, + "y": 35 + }, + "height": "300", + "hiddenSeries": false, + "id": 161, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Reads completed$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Reads completed", + "refId": "A", + "step": 10 + }, + { + "expr": "irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Writes completed", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk IOps Completed", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "iops", + "label": "IO read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "vda": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "Time spent on each read/write operation", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 1, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 35 + }, + "height": "300", + "hiddenSeries": false, + "id": 160, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/,*_Read time$/", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "irate(node_disk_read_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_reads_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Read time", + "refId": "B" + }, + { + "expr": "irate(node_disk_write_time_seconds_total{instance=~\"$node\"}[30m]) / irate(node_disk_writes_completed_total{instance=~\"$node\"}[30m])", + "format": "time_series", + "hide": false, + "instant": false, + "intervalFactor": 1, + "legendFormat": "{{instance}}_{{device}}_Write time", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk R/W Time(Reference: less than 100ms)(beta)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": "Time. read (-) / write (+)", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "TCP": "#6ED0E0" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "TCP_alloc - Allocated sockets\n\nCurrEstab - TCP connections for which the current state is either ESTABLISHED or CLOSE- WAIT\n\nTCP_tw - Sockets wating close\n\nUDP_inuse - Udp sockets currently in use\n\nSockets_used - Sockets currently in use", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 8, + "x": 0, + "y": 36 + }, + "height": "300", + "hiddenSeries": false, + "id": 158, + "interval": "", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*_Sockets_used/", + "color": "#C4162A", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_netstat_Tcp_CurrEstab{instance=~'$node'}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{instance}}_CurrEstab", + "refId": "A", + "step": 20 + }, + { + "expr": "node_sockstat_TCP_tw{instance=~'$node'}", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{instance}}_TCP_tw", + "refId": "D" + }, + { + "expr": "node_sockstat_sockets_used{instance=~'$node'}", + "legendFormat": "{{instance}}_Sockets_used", + "refId": "B" + }, + { + "expr": "node_sockstat_UDP_inuse{instance=~'$node'}", + "legendFormat": "{{instance}}_UDP_inuse", + "refId": "C" + }, + { + "expr": "node_sockstat_TCP_alloc{instance=~'$node'}", + "legendFormat": "{{instance}}_TCP_alloc", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Network Sockstat", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 25, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "__INSTANCE_ID__", + "value": "__INSTANCE_ID__" + }, + "datasource": "prometheus", + "definition": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)", + "hide": 2, + "includeAll": false, + "label": "Instance ID", + "multi": false, + "name": "instance_id", + "options": [], + "query": "label_values(node_uname_info{job=~\"ec2_instances\"}, instance_id)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allFormat": "glob", + "allValue": null, + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)", + "hide": 2, + "includeAll": true, + "label": "IP", + "multi": false, + "multiFormat": "regex values", + "name": "node", + "options": [], + "query": "label_values(node_uname_info{instance_id=~\"$instance_id\"},instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "/", + "value": "/" + }, + "datasource": "prometheus", + "definition": "", + "hide": 2, + "includeAll": false, + "label": "", + "multi": false, + "name": "maxmount", + "options": [], + "query": "query_result(topk(1,sort_desc (max(node_filesystem_size_bytes{instance=~'$node',fstype=~\"ext4|xfs\"}) by (mountpoint))))", + "refresh": 2, + "regex": "/.*\\\"(.*)\\\".*/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allFormat": "glob", + "allValue": null, + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": "prometheus", + "definition": "", + "hide": 2, + "includeAll": false, + "label": "环境", + "multi": false, + "multiFormat": "regex values", + "name": "env", + "options": [], + "query": "label_values(node_exporter_build_info,env)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allFormat": "glob", + "allValue": "", + "current": { + "selected": true, + "text": "None", + "value": [ + "" + ] + }, + "datasource": "prometheus", + "definition": "label_values(node_exporter_build_info{env=~'$env'},name)", + "hide": 2, + "includeAll": false, + "label": "名称", + "multi": true, + "multiFormat": "regex values", + "name": "name", + "options": [], + "query": "label_values(node_exporter_build_info{env=~'$env'},name)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "/.*/", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Master Node Details", + "uid": "3NR7BmmMk", + "version": 1 +} \ No newline at end of file diff --git a/monitoring/grafana/datasources/datasource.yml b/monitoring/grafana/datasources/datasource.yml new file mode 100755 index 0000000..ae235f3 --- /dev/null +++ b/monitoring/grafana/datasources/datasource.yml @@ -0,0 +1,27 @@ +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# +apiVersion: 1 +datasources: + - name: prometheus + type: prometheus + access: proxy + orgId: 1 + version: 1 + url: http://localhost:9090 + isDefault: true + editable: true + jsonData: + timeInterval: 5s + - name: cloudwatch + type: cloudwatch + orgId: 1 + version: 1 + editable: true + jsonData: + authType: default + defaultRegion: us-east + \ No newline at end of file diff --git a/monitoring/nginx/conf.d/nginx.conf b/monitoring/nginx/conf.d/nginx.conf new file mode 100644 index 0000000..0791b19 --- /dev/null +++ b/monitoring/nginx/conf.d/nginx.conf @@ -0,0 +1,22 @@ +server { + listen 80 default_server; + listen [::]:80 default_server; + server_name _; + server_tokens off; + + location /grafana/ { + proxy_pass http://localhost:3000/; + } + + location /prometheus/ { + proxy_pass http://localhost:9090/; + } + + location /pushgateway/ { + proxy_pass http://localhost:9091/; + } + + location /slurmexporter/ { + proxy_pass http://localhost:8081/; + } +} \ No newline at end of file diff --git a/monitoring/prometheus-slurm-exporter/slurm_exporter.service b/monitoring/prometheus-slurm-exporter/slurm_exporter.service new file mode 100644 index 0000000..d54498b --- /dev/null +++ b/monitoring/prometheus-slurm-exporter/slurm_exporter.service @@ -0,0 +1,13 @@ +[Unit] +Description=Prometheus SLURM Exporter + +[Service] +Environment=PATH=/opt/slurm/bin:$PATH +ExecStart=/usr/bin/prometheus-slurm-exporter -listen-address 0.0.0.0:8081 +Restart=on-failure +RestartSec=15 +Type=simple + + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml new file mode 100755 index 0000000..3438fbc --- /dev/null +++ b/monitoring/prometheus/prometheus.yml @@ -0,0 +1,73 @@ +# +# +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +# +# +global: + scrape_interval: 15s + evaluation_interval: 15s + scrape_timeout: 15s + +scrape_configs: + - job_name: 'slurm_exporter' + scrape_interval: 30s + scrape_timeout: 30s + static_configs: + - targets: ['localhost:8081'] + - job_name: 'pushgateway' + honor_labels: true + static_configs: + - targets: ['localhost:9091'] + - job_name: 'prometheus_server' + scrape_interval: 5s + static_configs: + - targets: ['localhost:9090'] + - job_name: 'ec2_instances' + scrape_interval: 5s + ec2_sd_configs: + - port: 9100 + refresh_interval: 10s + - port: 9400 + refresh_interval: 10s + filters: + - name: instance-type + values: + - p2.xlarge + - p2.8xlarge + - p2.16xlarge + - p3.2xlarge + - p3.8xlarge + - p3.16xlarge + - p3dn.24xlarge + - p4d.24xlarge + - g3s.xlarge + - g3.4xlarge + - g3.8xlarge + - g3.16xlarge + - g4dn.xlarge + - g4dn.2xlarge + - g4dn.4xlarge + - g4dn.8xlarge + - g4dn.16xlarge + - g4dn.12xlarge + - g4dn.metal + relabel_configs: + - source_labels: [__meta_ec2_tag_Name] + target_label: instance_name + - source_labels: [__meta_ec2_tag_Application] + target_label: instance_grafana + regex: __Application__ + action: keep + - source_labels: [__meta_ec2_instance_id] + target_label: instance_id + - source_labels: [__meta_ec2_availability_zone] + target_label: instance_az + - source_labels: [__meta_ec2_instance_state] + regex: running + action: keep + target_label: instance_state + - source_labels: [__meta_ec2_instance_type] + target_label: instance_type + - source_labels: [__meta_ec2_vpc_id] + target_label: instance_vpc \ No newline at end of file diff --git a/monitoring/www/aws-logo.svg b/monitoring/www/aws-logo.svg new file mode 100644 index 0000000..4d23322 --- /dev/null +++ b/monitoring/www/aws-logo.svg @@ -0,0 +1 @@ +AWS-Logo_White-Color \ No newline at end of file diff --git a/monitoring/www/background.png b/monitoring/www/background.png new file mode 100644 index 0000000..4208555 Binary files /dev/null and b/monitoring/www/background.png differ diff --git a/monitoring/www/index.html b/monitoring/www/index.html new file mode 100644 index 0000000..fe63d21 --- /dev/null +++ b/monitoring/www/index.html @@ -0,0 +1,97 @@ + + + + + AWS ParallelCluster + + + +
+ +

1Click-HPC

+

1Click-HPC is an open-source project that aims at speeding up the deployment of an HPC Cluster on AWS. You can have a fully functional and ready to use HPC cluster in minutes and with just 1-Click.

+

1Click-HPC source code and get started guide can be found .

+

It leverages on AWS supported services and projects, like:

+ + + +

+

+

+
+ + \ No newline at end of file diff --git a/parallelcluster/config.ap-east-1.sample b/parallelcluster/config.ap-east-1.sample index aa17ec5..2b89dd8 100644 --- a/parallelcluster/config.ap-east-1.sample +++ b/parallelcluster/config.ap-east-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.ap-northeast-1.sample b/parallelcluster/config.ap-northeast-1.sample index ad4cd28..04c5a3b 100644 --- a/parallelcluster/config.ap-northeast-1.sample +++ b/parallelcluster/config.ap-northeast-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.ap-northeast-2.sample b/parallelcluster/config.ap-northeast-2.sample index ad4cd28..04c5a3b 100644 --- a/parallelcluster/config.ap-northeast-2.sample +++ b/parallelcluster/config.ap-northeast-2.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.ap-south-1.sample b/parallelcluster/config.ap-south-1.sample index aa17ec5..2b89dd8 100644 --- a/parallelcluster/config.ap-south-1.sample +++ b/parallelcluster/config.ap-south-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.ca-central-1.sample b/parallelcluster/config.ca-central-1.sample index aa17ec5..2b89dd8 100644 --- a/parallelcluster/config.ca-central-1.sample +++ b/parallelcluster/config.ca-central-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.eu-central-1.sample b/parallelcluster/config.eu-central-1.sample index a7f6f09..47b2321 100644 --- a/parallelcluster/config.eu-central-1.sample +++ b/parallelcluster/config.eu-central-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.eu-north-1.sample b/parallelcluster/config.eu-north-1.sample index aa17ec5..2b89dd8 100644 --- a/parallelcluster/config.eu-north-1.sample +++ b/parallelcluster/config.eu-north-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "limited-ec2" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.eu-south-1.sample b/parallelcluster/config.eu-south-1.sample index 389f8f6..e2703e9 100644 --- a/parallelcluster/config.eu-south-1.sample +++ b/parallelcluster/config.eu-south-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, spot-batch, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.eu-west-1.sample b/parallelcluster/config.eu-west-1.sample index a7f6f09..47b2321 100644 --- a/parallelcluster/config.eu-west-1.sample +++ b/parallelcluster/config.eu-west-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.us-east-1.sample b/parallelcluster/config.us-east-1.sample index ad4cd28..04c5a3b 100644 --- a/parallelcluster/config.us-east-1.sample +++ b/parallelcluster/config.us-east-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.us-east-2.sample b/parallelcluster/config.us-east-2.sample index ad4cd28..04c5a3b 100644 --- a/parallelcluster/config.us-east-2.sample +++ b/parallelcluster/config.us-east-2.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.us-west-1.sample b/parallelcluster/config.us-west-1.sample index ad4cd28..04c5a3b 100644 --- a/parallelcluster/config.us-west-1.sample +++ b/parallelcluster/config.us-west-1.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/parallelcluster/config.us-west-2.sample b/parallelcluster/config.us-west-2.sample index ad4cd28..04c5a3b 100644 --- a/parallelcluster/config.us-west-2.sample +++ b/parallelcluster/config.us-west-2.sample @@ -40,7 +40,7 @@ queue_settings = batch-efa, batch, dcv-gpu, dcv, spot-batch-efa post_install = ${POST_INSTALL} post_install_args = "${POST_INSTALL_ARGS}" tags = {"EnginFrame" : "true", "1Click-HPC" : "true", "1Click-HPC-version" : "0.1", "1Click-HPC-Template" : "standard" } -additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite +additional_iam_policies = arn:aws:iam::aws:policy/ElasticLoadBalancingFullAccess,arn:aws:iam::aws:policy/SecretsManagerReadWrite,arn:aws:iam::aws:policy/CloudWatchFullAccess,arn:aws:iam::aws:policy/AWSPriceListServiceFullAccess,arn:aws:iam::aws:policy/AmazonSSMFullAccess,arn:aws:iam::aws:policy/AWSCloudFormationReadOnlyAccess ${EXTRA_JSON} [fsx new] diff --git a/scripts/post.install.sh b/scripts/post.install.sh index 00536d1..9e94c3e 100644 --- a/scripts/post.install.sh +++ b/scripts/post.install.sh @@ -82,7 +82,7 @@ export ec2user_home=$(getent passwd | grep ec2-user | sed 's/^.*:.*:.*:.*:.*:\(. export dna_json="/etc/chef/dna.json" if [[ -z "${cfn_postinstall_args}" ]]; then - export myscripts="03.configure.slurm.acct.master.sh 04.configure.slurm.AllOrNothing.master.sh 04.configure.disable.anacron.compute.sh 05.install.ldap.server.master.sh 06.install.ldap.client.compute.sh 06.install.ldap.client.master.sh 07.configure.slurm.tagging.master.sh 10.install.enginframe.master.sh 11.install.ldap.enginframe.master.sh 12.configure.enginframe.alb.master.sh 20.install.dcv.slurm.master.sh 25.install.dcv-server.compute.sh 26.configure.dcv.alb.compute.sh 35.install.dcv.slurm.compute.sh" + export myscripts="03.configure.slurm.acct.master.sh 04.configure.slurm.AllOrNothing.master.sh 04.configure.disable.anacron.compute.sh 05.install.ldap.server.master.sh 06.install.ldap.client.compute.sh 06.install.ldap.client.master.sh 07.configure.slurm.tagging.master.sh 10.install.enginframe.master.sh 11.install.ldap.enginframe.master.sh 12.configure.enginframe.alb.master.sh 20.install.dcv.slurm.master.sh 25.install.dcv-server.compute.sh 26.configure.dcv.alb.compute.sh 35.install.dcv.slurm.compute.sh 40.install.monitoring.master.sh 40.install.monitoring.compute.sh" else export myscripts="${@:2}" fi