Skip to content

Commit

Permalink
Merge pull request #5 from aws-samples/monitoring
Browse files Browse the repository at this point in the history
Monitoring
  • Loading branch information
nicolaven authored Oct 18, 2021
2 parents 2150d30 + e8541df commit d97718a
Show file tree
Hide file tree
Showing 38 changed files with 11,598 additions and 16 deletions.
4 changes: 2 additions & 2 deletions Templates/AWS-HPC-Cluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -957,8 +957,8 @@ Outputs:
Cloud9URL:
Description: Cloud9 Environment
Value: !Sub 'https://${AWS::Region}.console.aws.amazon.com/cloud9/ide/${Cloud9}'
EnginFrameURL:
WebURL:
Description: "EnginFrame HPC Portal, default username: ec2-user , default password: Change_this!"
Value: !Sub
- 'https://${ALB}/enginframe'
- 'https://${ALB}/'
- ALB: !GetAtt ApplicationLoadBalancer.DNSName
64 changes: 64 additions & 0 deletions modules/40.install.monitoring.compute.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Load the AWS ParallelCluster environment (defines cfn_* variables such as
# cfn_cluster_user used below).
source /etc/parallelcluster/cfnconfig
# Instance type of this compute node; ec2-metadata prints "instance-type: <type>",
# awk keeps the second field.
compute_instance_type=$(ec2-metadata -t | awk '{print $2}')
# Regex matching GPU instance families (p2..p9 / g2..g9, any size) — used to decide
# whether to start the GPU flavor of the monitoring compose file.
gpu_instances="[pg][2-9].*\.[0-9]*[x]*large"

monitoring_dir_name="monitoring"
# SHARED_FS_DIR is assumed to come from cfnconfig or the calling environment — TODO confirm.
monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"

# Trace every command and abort on the first failure.
set -x
set -e

# Install Docker, Go and docker-compose — prerequisites of the monitoring stack.
# Globals: cfn_cluster_user (read, from cfnconfig).
installPreReq() {
yum -y install docker golang-bin
service docker start
chkconfig docker on
# Let the cluster user run docker without sudo; quoted to survive unusual values.
usermod -a -G docker "$cfn_cluster_user"

#to be replaced with yum -y install docker-compose as the repository problem is fixed
# -f makes curl exit non-zero on HTTP errors (404/5xx) so that 'set -e' aborts,
# instead of silently saving an HTML error page as the docker-compose binary.
curl -fL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
}

# Start the compute-node monitoring containers; on GPU instance types, first
# install nvidia-docker2 so containers can access the GPUs.
# Globals: compute_instance_type, gpu_instances, monitoring_home (read).
configureMonitoring() {

if [[ $compute_instance_type =~ $gpu_instances ]]; then
# e.g. "amzn2" / "centos7" — used to pick the matching nvidia-docker repo file.
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
# NOTE(review): without 'set -o pipefail' a curl failure here is masked by tee's
# exit status and would leave an empty/partial repo file — consider hardening.
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | tee /etc/yum.repos.d/nvidia-docker.repo
yum -y clean expire-cache
yum -y install nvidia-docker2
# Restart so docker picks up the nvidia runtime.
systemctl restart docker
/usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.gpu.yml" -p monitoring-compute up -d

else
/usr/local/bin/docker-compose -f "${monitoring_home}/docker-compose/docker-compose.compute.yml" -p monitoring-compute up -d
fi
}

# main
# ----------------------------------------------------------------------------
# Entry point: install prerequisites, then start the monitoring containers,
# logging start/stop markers to stderr.
main() {
local self="40.install.monitoring.compute.sh"
# Small helper so both markers share one format string.
log() { echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] ${1}" >&2; }
log "${self}: START"
installPreReq
configureMonitoring
log "${self}: STOP"
}

main "$@"
130 changes: 130 additions & 0 deletions modules/40.install.monitoring.master.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/bin/bash

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Load the AWS ParallelCluster environment (defines cfn_* variables, stack_name, etc.).
source /etc/parallelcluster/cfnconfig
# FSx filesystem id, scraped from the chef dna.json with grep/awk/sed.
cfn_fsx_fs_id=$(cat /etc/chef/dna.json | grep \"cfn_fsx_fs_id\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
# This head/master node's EC2 instance id.
master_instance_id=$(ec2-metadata -i | awk '{print $2}')
# MaxSize parameter of the cluster's CloudFormation stack.
cfn_max_queue_size=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region | jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "MaxSize"))[0].ParameterValue')
monitoring_dir_name="monitoring"
monitoring_home="${SHARED_FS_DIR}/${monitoring_dir_name}"
chef_dna="/etc/chef/dna.json"
# Bucket name extracted from the s3:// post-install URL.
s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
# Grafana admin password stored in Secrets Manager under the stack name.
grafana_password=$(aws secretsmanager get-secret-value --secret-id "${stack_name}" --query SecretString --output text --region "${cfn_region}")
# BUG FIX: the original passed "${dna_json}", a variable never defined in this
# script, so jq could not open its input and NICE_ROOT fell back incorrectly.
# chef_dna (defined above) points at the same /etc/chef/dna.json.
NICE_ROOT=$(jq --arg default "${SHARED_FS_DIR}/nice" -r '.post_install.enginframe | if has("nice_root") then .nice_root else $default end' "${chef_dna}")


# Trace every command and abort on the first failure.
set -x
set -e

# Install Docker, Go and docker-compose — prerequisites of the monitoring stack.
# Globals: cfn_cluster_user (read, from cfnconfig).
installPreReq() {
yum -y install docker golang-bin
service docker start
chkconfig docker on
# Let the cluster user run docker without sudo; quoted to survive unusual values.
usermod -a -G docker "$cfn_cluster_user"

#to be replaced with yum -y install docker-compose as the repository problem is fixed
# -f makes curl exit non-zero on HTTP errors (404/5xx) so that 'set -e' aborts,
# instead of silently saving an HTML error page as the docker-compose binary.
curl -fL "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
}

# Download the cluster's own config JSON from S3 into the shared monitoring dir,
# and compute the CloudWatch log-group name used later for the logs dashboard.
# Globals: chef_dna, stack_name, monitoring_home, cfn_region (read);
#          log_group_names (written — consumed by configureMonitoring).
saveClusterConfigLocally(){

# Scrape bucket/key/version of the cluster config out of dna.json.
# NOTE(review): grep/awk/sed JSON scraping — jq is already available and would be
# more robust; left as-is to avoid guessing the exact JSON structure.
cluster_s3_bucket=$(cat "${chef_dna}" | grep \"cluster_s3_bucket\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
cluster_config_s3_key=$(cat "${chef_dna}" | grep \"cluster_config_s3_key\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
cluster_config_version=$(cat "${chef_dna}" | grep \"cluster_config_version\" | awk '{print $2}' | sed "s/\",//g;s/\"//g")
# Pre-escaped ("\/") because it is substituted into a sed replacement later.
log_group_names="\/aws\/parallelcluster\/$(echo ${stack_name} | cut -d "-" -f2-)"

mkdir -p "${monitoring_home}/parallelcluster"
aws s3api get-object --bucket $cluster_s3_bucket --key $cluster_config_s3_key --region $cfn_region --version-id $cluster_config_version "${monitoring_home}/parallelcluster/cluster-config.json"
}

# Copy the monitoring assets from the post-install S3 prefix into the shared FS,
# install the custom-metrics scripts, the slurm exporter unit, and the EnginFrame
# web assets.
# Globals: post_install_base (read — presumably set by cfnconfig; TODO confirm),
#          monitoring_home, cfn_cluster_user, NICE_ROOT, cfn_region (read).
installMonitoring(){

aws s3 cp --recursive "${post_install_base}/monitoring" "${monitoring_home}" --region "${cfn_region}" || exit 1
chown $cfn_cluster_user:$cfn_cluster_user -R "${monitoring_home}"
chmod +x ${monitoring_home}/custom-metrics/*

# Cost-metric collectors are invoked from cron via /usr/local/bin (see configureMonitoring).
cp -rp ${monitoring_home}/custom-metrics/* /usr/local/bin/
mv -f "${monitoring_home}/prometheus-slurm-exporter/slurm_exporter.service" /etc/systemd/system/

# Static web content served by the EnginFrame portal.
cp -rp ${monitoring_home}/www/* "${NICE_ROOT}/enginframe/conf/tomcat/webapps/ROOT/"
}



# Configure the monitoring stack: schedule the cost collectors, substitute
# deployment-specific tokens into the Grafana/Prometheus/compose templates, and
# build the prometheus-slurm-exporter binary.
# Globals (read): cfn_cluster_user, s3_bucket, master_instance_id, cfn_fsx_fs_id,
#   cfn_region, log_group_names, stack_name, monitoring_home, grafana_password.
configureMonitoring() {

# Append cost collectors to the cluster user's crontab.
# NOTE(review): "*/60" in the minutes field only ever matches minute 0 (hourly);
# "0 * * * *" is the conventional spelling of the same schedule.
(crontab -l -u $cfn_cluster_user; echo "*/1 * * * * /usr/local/bin/1m-cost-metrics.sh") | crontab -u $cfn_cluster_user -
(crontab -l -u $cfn_cluster_user; echo "*/60 * * * * /usr/local/bin/1h-cost-metrics.sh") | crontab -u $cfn_cluster_user -

# replace tokens
sed -i "s/_S3_BUCKET_/${s3_bucket}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
sed -i "s/__FSX_ID__/${cfn_fsx_fs_id}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"
sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/ParallelCluster.json"

sed -i "s/__AWS_REGION__/${cfn_region}/g" "${monitoring_home}/grafana/dashboards/logs.json"
# log_group_names is pre-escaped for this sed replacement (set in saveClusterConfigLocally).
sed -i "s/__LOG_GROUP__NAMES__/${log_group_names}/g" "${monitoring_home}/grafana/dashboards/logs.json"

sed -i "s/__Application__/${stack_name}/g" "${monitoring_home}/prometheus/prometheus.yml"

sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/master-node-details.json"
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-list.json"
sed -i "s/__INSTANCE_ID__/${master_instance_id}/g" "${monitoring_home}/grafana/dashboards/compute-node-details.json"

# '~' delimiter because the substituted values contain '/' (paths, passwords).
sed -i "s~__MONITORING_DIR__~${monitoring_home}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml"
sed -i "s~__GRAFANA_PASSWORD__~${grafana_password}~g" "${monitoring_home}/docker-compose/docker-compose.master.yml"


# Download and build prometheus-slurm-exporter
##### Please note this software package is under GPLv3 License #####
# More info here: https://github.com/vpenso/prometheus-slurm-exporter/blob/master/LICENSE
cd "${monitoring_home}"
#FIXME: temporary
rm -rf prometheus-slurm-exporter
git clone https://github.com/vpenso/prometheus-slurm-exporter.git
cd prometheus-slurm-exporter
# Patch the sinfo format string inside the exporter's node.go before building.
sed -i 's/NodeList,AllocMem,Memory,CPUsState,StateLong/NodeList: ,AllocMem: ,Memory: ,CPUsState: ,StateLong:/' node.go
GOPATH=/root/go-modules-cache HOME=/root go mod download
GOPATH=/root/go-modules-cache HOME=/root go build
mv -f "${monitoring_home}/prometheus-slurm-exporter/prometheus-slurm-exporter" /usr/bin/prometheus-slurm-exporter
}


# Bring up the master-node monitoring containers and the slurm exporter service.
# Globals: monitoring_home (read).
startMonitoringDaemons() {
local compose_file="${monitoring_home}/docker-compose/docker-compose.master.yml"

# cfnconfig doubles as the compose env file so cfn_* variables are available
# inside the compose template.
/usr/local/bin/docker-compose --env-file /etc/parallelcluster/cfnconfig -f "${compose_file}" -p monitoring-master up -d

# Pick up the unit file installed by installMonitoring, then enable and start it.
systemctl daemon-reload
systemctl enable slurm_exporter
systemctl start slurm_exporter
}

# main
# ----------------------------------------------------------------------------
# Entry point: install prerequisites, stage the monitoring assets, configure
# them, and start the daemons, logging start/stop markers to stderr.
main() {
local self="40.install.monitoring.master.sh"
# Small helper so both markers share one format string.
log() { echo "[INFO][$(date '+%Y-%m-%d %H:%M:%S')] ${1}" >&2; }
log "${self}: START"
installPreReq
saveClusterConfigLocally
installMonitoring
configureMonitoring
startMonitoringDaemons
log "${self}: STOP"
}

main "$@"
123 changes: 123 additions & 0 deletions monitoring/custom-metrics/1h-cost-metrics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#!/bin/bash
#
#
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
#

#source the AWS ParallelCluster profile
. /etc/parallelcluster/cfnconfig

export AWS_DEFAULT_REGION=$cfn_region
# Pricing API filters use long region names ("EU (Ireland)"); the helper returns
# "Europe (...)" for some regions, which the Pricing API lists as "EU (...)".
aws_region_long_name=$(python /usr/local/bin/aws-region.py $cfn_region)
aws_region_long_name=${aws_region_long_name/Europe/EU}

masterInstanceType=$(ec2-metadata -t | awk '{print $2}')
masterInstanceId=$(ec2-metadata -i | awk '{print $2}')
# Bucket name extracted from the s3:// post-install URL.
s3_bucket=$(echo $cfn_postinstall | sed "s/s3:\/\///g;s/\/.*//")
# Total bucket size in GB (sum of object sizes, integer-divided via bc).
s3_size_gb=$(echo "$(aws s3api list-objects --bucket $s3_bucket --output json --query "[sum(Contents[].Size)]"| sed -n 2p | tr -d ' ') / 1024 / 1024 / 1024" | bc)


#retrieve the s3 cost tier (S3 Standard pricing is tiered by stored volume)
if [[ $s3_size_gb -le 51200 ]]; then
s3_range=51200
# BUG FIX: the original tested the undefined variable $VAR here, so the
# 512000-GB tier could never be selected.
elif [[ $s3_size_gb -le 512000 ]]; then
s3_range=512000
else
s3_range="Inf"
fi

####################### S3 #########################

# Price per GB-month for S3 General Purpose storage in this region, taken from
# the tier (endRange) selected above. Pricing API only lives in us-east-1.
s3_cost_gb_month=$(aws --region us-east-1 pricing get-products \
--service-code AmazonS3 \
--filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
'Type=TERM_MATCH,Field=storageClass,Value=General Purpose' \
--query 'PriceList[0]' --output text \
| jq -r --arg endRange $s3_range '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[].value | select(.endRange==$endRange).pricePerUnit.USD')

# Convert monthly GB price to an hourly cost (720 h/month) and push to the
# Prometheus pushgateway running locally.
s3=$(echo "scale=2; $s3_cost_gb_month * $s3_size_gb / 720" | bc)
echo "s3_cost $s3" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost


####################### Master #########################
# On-demand hourly price of the master node's instance type for this region
# (Linux, shared tenancy, no pre-installed software).
# NOTE(review): capacitystatus=UnusedCapacityReservation looks unusual for an
# on-demand price lookup (capacitystatus=Used is typical) — confirm this matches
# the intended price dimension.
master_node_h_price=$(aws pricing get-products \
--region us-east-1 \
--service-code AmazonEC2 \
--filters 'Type=TERM_MATCH,Field=instanceType,Value='$masterInstanceType \
'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
'Type=TERM_MATCH,Field=preInstalledSw,Value=NA' \
'Type=TERM_MATCH,Field=operatingSystem,Value=Linux' \
'Type=TERM_MATCH,Field=tenancy,Value=Shared' \
'Type=TERM_MATCH,Field=capacitystatus,Value=UnusedCapacityReservation' \
--output text \
--query 'PriceList' \
| jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

# Push the hourly master-node cost to the local pushgateway.
echo "master_node_cost $master_node_h_price" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost


# FSx filesystem id: second comma-separated field of the stack's FSXOptions parameter.
fsx_id=$(aws cloudformation describe-stacks --stack-name $stack_name --region $cfn_region \
| jq -r '.Stacks[0].Parameters | map(select(.ParameterKey == "FSXOptions"))[0].ParameterValue' \
| awk -F "," '{print $2}')
fsx_summary=$(aws fsx describe-file-systems --region $cfn_region --file-system-ids $fsx_id)
fsx_size_gb=$(echo $fsx_summary | jq -r '.FileSystems[0].StorageCapacity')
fsx_type=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.DeploymentType')
fsx_throughput=$(echo $fsx_summary | jq -r '.FileSystems[0].LustreConfiguration.PerUnitStorageThroughput')

# SCRATCH deployments have no per-unit throughput dimension; PERSISTENT_1 prices
# vary by throughput tier.
if [[ $fsx_type = "SCRATCH_2" ]] || [[ $fsx_type = "SCRATCH_1" ]]; then
fsx_cost_gb_month=$(aws pricing get-products \
--region us-east-1 \
--service-code AmazonFSx \
--filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
'Type=TERM_MATCH,Field=throughputCapacity,Value=N/A' \
--output text \
--query 'PriceList' \
| jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

# FIX: use [[ ]] like the branch above — the original single-bracket test with an
# unquoted $fsx_type breaks ("unary operator expected") if the value is empty.
elif [[ $fsx_type = "PERSISTENT_1" ]]; then
fsx_cost_gb_month=$(aws pricing get-products \
--region us-east-1 \
--service-code AmazonFSx \
--filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
'Type=TERM_MATCH,Field=fileSystemType,Value=Lustre' \
'Type=TERM_MATCH,Field=throughputCapacity,Value='$fsx_throughput \
--output text \
--query 'PriceList' \
| jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

else
# Unknown deployment type (e.g. no FSx configured): report zero cost.
fsx_cost_gb_month=0
fi

# Convert monthly GB price to hourly (720 h/month) and push to the pushgateway.
fsx=$(echo "scale=2; $fsx_cost_gb_month * $fsx_size_gb / 720" | bc)
echo "fsx_cost $fsx" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost


#parametrize:
# Sum the hourly cost of every EBS volume attached to the master node.
ebs_volume_total_cost=0
ebs_volume_ids=$(aws ec2 describe-instances --instance-ids $masterInstanceId \
| jq -r '.Reservations | to_entries[].value | .Instances | to_entries[].value | .BlockDeviceMappings | to_entries[].value | .Ebs.VolumeId')

for ebs_volume_id in $ebs_volume_ids
do
# Describe the volume once and parse it twice — the original issued a separate
# describe-volumes API call per field.
ebs_volume_json=$(aws ec2 describe-volumes --volume-ids $ebs_volume_id)
ebs_volume_type=$(echo "$ebs_volume_json" | jq -r '.Volumes | to_entries[].value.VolumeType')
#ebs_volume_iops=$(echo "$ebs_volume_json" | jq -r '.Volumes | to_entries[].value.Iops')
ebs_volume_size=$(echo "$ebs_volume_json" | jq -r '.Volumes | to_entries[].value.Size')

# Per-GB-month storage price for this volume type in this region.
ebs_cost_gb_month=$(aws --region us-east-1 pricing get-products \
--service-code AmazonEC2 \
--query 'PriceList' \
--output text \
--filters 'Type=TERM_MATCH,Field=location,Value='"${aws_region_long_name}" \
'Type=TERM_MATCH,Field=productFamily,Value=Storage' \
'Type=TERM_MATCH,Field=volumeApiName,Value='$ebs_volume_type \
| jq -r '.terms.OnDemand | to_entries[] | .value.priceDimensions | to_entries[] | .value.pricePerUnit.USD')

# Monthly GB price -> hourly cost (720 h/month), accumulated across volumes.
ebs_volume_cost=$(echo "scale=2; $ebs_cost_gb_month * $ebs_volume_size / 720" | bc)
ebs_volume_total_cost=$(echo "scale=2; $ebs_volume_total_cost + $ebs_volume_cost" | bc)
done

echo "ebs_master_cost $ebs_volume_total_cost" | curl --data-binary @- http://127.0.0.1:9091/metrics/job/cost
Loading

0 comments on commit d97718a

Please sign in to comment.