diff --git a/README.md b/README.md
index 92eff436..eb1771e2 100644
--- a/README.md
+++ b/README.md
@@ -265,7 +265,7 @@ Example:
 
 The name of the cluster must be queueName-clusterNumber-instanceType_keyword
 
-The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be regirstered in Slurm
+The keyword will need to match the one from /opt/oci-hpc/conf/queues.conf to be registered in Slurm
 
 ### Cluster Deletion: 
 ```
@@ -293,8 +293,8 @@ Example of cluster command to add a new user: 
 ```cluster user add name```
 By default, a `privilege` group is created that has access to the NFS and can have sudo access on all nodes (Defined at the stack creation. This group has ID 9876) The group name can be modified.
 ```cluster user add name --gid 9876```
-To generate a user-specific key for passwordless ssh between nodes, use --ssh.
-```cluster user add name --ssh --gid 9876```
+To avoid generating a user-specific key for passwordless ssh between nodes, use --nossh.
+```cluster user add name --nossh --gid 9876```
 
 # Shared home folder
 
@@ -318,3 +318,43 @@ $ max_nodes --> Information about all the partitions and their respective cluste
 $ max_nodes --include_cluster_names xxx yyy zzz --> where xxx, yyy, zzz are cluster names. Provide a space separated list of cluster names to be considered for displaying the information about clusters and maximum number of nodes distributed evenly per partition
 
+
+## validation.py usage
+
+Use the alias "validate" to run the python script validation.py. You can run this script only from the bastion.
+
+The script performs these checks:
+-> Check that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, and inventory files.
+-> PCIe bandwidth check
+-> GPU Throttle check
+-> Check whether the md5 sum of the /etc/hosts file on each node matches the one on the bastion
+
+Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]
+
+Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]: [-cn CLUSTER_NAMES]
+Provide a file that lists, one per line, each cluster for which you want to validate the number of nodes and/or run the pcie check, gpu throttle check, or /etc/hosts md5 sum check.
+
+For the pcie, gpu throttle, and /etc/hosts md5 sum checks, either provide y or Y along with -cn, or give a hostfile path (each host on a separate line) for the corresponding argument. For the number of nodes check, provide y, optionally along with -cn.
+
+Below are some examples of running this script.
+
+validate -n y --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, and inventory files. The clusters considered will be the default cluster, if any, and the cluster(s) found in the /opt/oci-hpc/autoscaling/clusters directory. The number of nodes considered will be taken from the resize script using those clusters.
+
+validate -n y -cn --> This will validate that the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, and inventory files. The clusters considered will be those listed in the file passed to the -cn option. The number of nodes considered will be taken from the resize script using the clusters from the file. The comparison this check performs is sketched below.
+
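+The following is a minimal, illustrative sketch of the kind of comparison the number of nodes check performs; the function and variable names here are hypothetical and are not part of validation.py:
+
+```
+# Illustrative sketch: compare a node -> cluster mapping built from the resize
+# script against one built from another source (for example /etc/hosts) and
+# report any node that is missing on either side.
+def report_mismatches(resize_map, other_map, source_name):
+    for node in resize_map:
+        if node not in other_map:
+            print(node + " is not in " + source_name)
+    for node in other_map:
+        if node not in resize_map:
+            print(node + " is not in resize list")
+
+# Hypothetical example data; the real script builds these mappings itself.
+resize_map = {"compute-1-node-1": "compute-1"}
+etc_hosts_map = {"compute-1-node-1": "compute-1", "compute-1-node-2": "compute-1"}
+report_mismatches(resize_map, etc_hosts_map, "/etc/hosts")
+```
+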
+validate -p y -cn --> This will run the pcie bandwidth check. The clusters considered will be those listed in the file passed to the -cn option. The hosts checked will be taken from the resize script using the clusters from the file.
+
+validate -p --> This will run the pcie bandwidth check on the hosts listed in the file given to -p. The pcie host file should have one host name per line.
+
+validate -g y -cn --> This will run the GPU throttle check. The clusters considered will be those listed in the file passed to the -cn option. The hosts checked will be taken from the resize script using the clusters from the file.
+
+validate -g --> This will run the GPU throttle check on the hosts listed in the file given to -g. The gpu check host file should have one host name per line.
+
+validate -e y -cn --> This will run the /etc/hosts md5 sum check. The clusters considered will be those listed in the file passed to the -cn option. The hosts checked will be taken from the resize script using the clusters from the file.
+
+validate -e --> This will run the /etc/hosts md5 sum check on the hosts listed in the file given to -e. The md5 sum check host file should have one host name per line.
+
+You can combine all the options together, such as:
+validate -n y -p y -g y -e y -cn
+
+
diff --git a/autoscaling/crontab/autoscale_slurm.sh b/autoscaling/crontab/autoscale_slurm.sh
index 5bbee65d..7e2a0aa7 100755
--- a/autoscaling/crontab/autoscale_slurm.sh
+++ b/autoscaling/crontab/autoscale_slurm.sh
@@ -169,6 +169,11 @@ def getClusterName(node):
         for output in stdout.split('\n')[:-1]:
             if "Switches=" in output:
                 clusterName=output.split()[0].split('SwitchName=')[1]
+                break
+            elif "SwitchName=inactive-" in output:
+                continue
+            else:
+                clusterName=output.split()[0].split('SwitchName=')[1]
     elif len(stdout.split('\n')) == 2:
         clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1]
     if clusterName.startswith("inactive-"):
@@ -352,7 +357,7 @@ try:
         cluster_name=cluster[0]
         print ("Deleting cluster "+cluster_name)
         subprocess.Popen([script_path+'/delete_cluster.sh',cluster_name])
-        time.sleep(1)
+        time.sleep(5)
 
     for cluster_name in nodes_to_destroy.keys():
         print ("Resizing cluster "+cluster_name)
@@ -374,7 +379,6 @@ try:
             subprocess.Popen([script_path+'/resize.sh','--force','--cluster_name',cluster_name,'remove','--remove_unreachable','--nodes']+initial_nodes)
         if len(unreachable_nodes) > 0:
             subprocess.Popen([script_path+'/resize.sh','--cluster_name',cluster_name,'remove_unreachable','--nodes']+unreachable_nodes)
-        time.sleep(1)
 
 
     for index,cluster in enumerate(cluster_to_build):
diff --git a/autoscaling/tf_init/bastion_update.tf b/autoscaling/tf_init/bastion_update.tf
index 18fc5889..316af6ff 100755
--- a/autoscaling/tf_init/bastion_update.tf
+++ b/autoscaling/tf_init/bastion_update.tf
@@ -22,10 +22,14 @@ resource "local_file" "inventory" {
     bastion_ip = var.bastion_ip,
     backup_name = var.backup_name,
     backup_ip = var.backup_ip,
+    login_name = var.login_name,
+    login_ip = var.login_ip,
     compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[])
     public_subnet = var.public_subnet,
     private_subnet = var.private_subnet,
-    nfs = local.cluster_instances_names[0],
+    rdma_network = cidrhost(var.rdma_subnet, 0),
+    rdma_netmask = cidrnetmask(var.rdma_subnet),
+    nfs = var.use_scratch_nfs ? local.cluster_instances_names[0] : "",
     scratch_nfs = var.use_scratch_nfs,
     cluster_nfs = var.use_cluster_nfs,
     home_nfs = var.home_nfs,
@@ -53,7 +57,7 @@ resource "local_file" "inventory" {
     cluster_mount_ip = local.mount_ip,
     cluster_name = local.cluster_name,
     shape = var.cluster_network ?
var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus=var.instance_pool_ocpus, + instance_pool_ocpus=local.instance_pool_ocpus, queue=var.queue, instance_type=var.instance_type, autoscaling_monitoring = var.autoscaling_monitoring, @@ -63,7 +67,9 @@ resource "local_file" "inventory" { privilege_group_name = var.privilege_group_name, latency_check = var.latency_check bastion_username = var.bastion_username, - compute_username = var.compute_username + compute_username = var.compute_username, + pam = var.pam, + sacct_limits = var.sacct_limits }) filename = "${local.bastion_path}/inventory" } diff --git a/autoscaling/tf_init/inventory.tpl b/autoscaling/tf_init/inventory.tpl index 11d848f0..9d2c062d 100755 --- a/autoscaling/tf_init/inventory.tpl +++ b/autoscaling/tf_init/inventory.tpl @@ -2,6 +2,8 @@ ${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion [slurm_backup] %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${bastion_username} role=bastion%{ endif } +[login] +%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -12,15 +14,15 @@ ${host} ansible_host=${ip} ansible_user=${compute_username} role=compute compute_to_add compute_configured [nfs] -${nfs} +%{ if nfs != "" }${nfs} ansible_user=${compute_username} role=nfs%{ endif } [all:children] bastion compute [all:vars] ansible_connection=ssh ansible_user=${compute_username} -rdma_network=192.168.128.0 -rdma_netmask=255.255.240.0 +rdma_network=${rdma_network} +rdma_netmask=${rdma_netmask} public_subnet=${public_subnet} private_subnet=${private_subnet} nvme_path=/mnt/localdisk/ @@ -62,3 +64,5 @@ privilege_group_name=${privilege_group_name} latency_check=${latency_check} compute_username=${compute_username} bastion_username=${bastion_username} +pam = ${pam} +sacct_limits=${sacct_limits} \ No newline at end of file diff --git a/autoscaling/tf_init/locals.tf b/autoscaling/tf_init/locals.tf index f4416844..283f3245 100755 --- a/autoscaling/tf_init/locals.tf +++ b/autoscaling/tf_init/locals.tf @@ -3,6 +3,9 @@ locals { cluster_instances_ids = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.id : data.oci_core_instance.instance_pool_instances.*.id cluster_instances_names = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.display_name : data.oci_core_instance.instance_pool_instances.*.display_name image_ocid = var.unsupported ? var.image_ocid : var.image + + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape + instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus // ips of the instances cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip @@ -20,7 +23,7 @@ locals { // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id // is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] - is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? 
[var.instance_pool_ocpus]:[] + is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] // bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" diff --git a/autoscaling/tf_init/versions.tf b/autoscaling/tf_init/versions.tf index 44a6c867..458fd9db 100755 --- a/autoscaling/tf_init/versions.tf +++ b/autoscaling/tf_init/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.99.0" + version = "4.112.0" } } } \ No newline at end of file diff --git a/bastion.tf b/bastion.tf index 7ee4ede6..172d630f 100644 --- a/bastion.tf +++ b/bastion.tf @@ -74,6 +74,7 @@ resource "null_resource" "bastion" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "sudo mkdir -p /opt/oci-hpc", "sudo chown ${var.bastion_username}:${var.bastion_username} /opt/oci-hpc/", "mkdir -p /opt/oci-hpc/bin", @@ -176,6 +177,7 @@ resource "null_resource" "bastion" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 600 /home/${var.bastion_username}/.ssh/cluster.key", "cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa", "chmod a+x /opt/oci-hpc/bin/*.sh", @@ -201,12 +203,14 @@ resource "null_resource" "cluster" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, rdma_network = cidrhost(var.rdma_subnet, 0), rdma_netmask = cidrnetmask(var.rdma_subnet), - nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", + nfs = var.node_count > 0 && var.use_scratch_nfs ? local.cluster_instances_names[0] : "", home_nfs = var.home_nfs, create_fss = var.create_fss, home_fss = var.home_fss, @@ -232,8 +236,8 @@ resource "null_resource" "cluster" { cluster_mount_ip = local.mount_ip, autoscaling = var.autoscaling, cluster_name = local.cluster_name, - shape = var.cluster_network ? 
var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus = var.instance_pool_ocpus, + shape = local.shape, + instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, monitoring = var.monitoring, hyperthreading = var.hyperthreading, @@ -248,7 +252,14 @@ resource "null_resource" "cluster" { pyxis = var.pyxis, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check + latency_check = var.latency_check, + pam = var.pam, + sacct_limits = var.sacct_limits, + inst_prin = var.inst_prin, + region = var.region, + tenancy_ocid = var.tenancy_ocid, + api_fingerprint = var.api_fingerprint, + api_user_ocid = var.api_user_ocid }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -303,7 +314,7 @@ resource "null_resource" "cluster" { private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, targetCompartment = var.targetCompartment, - instance_pool_ocpus = var.instance_pool_ocpus, + instance_pool_ocpus = local.instance_pool_ocpus, instance_pool_memory = var.instance_pool_memory, instance_pool_custom_memory = var.instance_pool_custom_memory, queue=var.queue, @@ -325,14 +336,18 @@ resource "null_resource" "cluster" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.bastion_subnet_id, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, + rdma_subnet = var.rdma_subnet, nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs && var.node_count > 0, scratch_nfs_path = var.scratch_nfs_path, + use_scratch_nfs = var.use_scratch_nfs, slurm = var.slurm, rack_aware = var.rack_aware, slurm_nfs_path = var.add_nfs ? 
var.nfs_source_path : var.cluster_nfs_path @@ -376,7 +391,9 @@ resource "null_resource" "cluster" { private_deployment = var.private_deployment, use_multiple_ads = var.use_multiple_ads, bastion_username = var.bastion_username, - compute_username = var.compute_username + compute_username = var.compute_username, + pam = var.pam, + sacct_limits = var.sacct_limits }) destination = "/opt/oci-hpc/conf/variables.tf" @@ -409,7 +426,7 @@ provisioner "file" { } provisioner "file" { content = base64decode(var.api_user_key) - destination = "/opt/oci-hpc/autoscaling/credentials/key.initial" + destination = "/opt/oci-hpc/autoscaling/credentials/key.pem" connection { host = local.host type = "ssh" @@ -420,13 +437,12 @@ provisioner "file" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", - "chmod 755 /opt/oci-hpc/autoscaling/credentials/key.sh", - "/opt/oci-hpc/autoscaling/credentials/key.sh /opt/oci-hpc/autoscaling/credentials/key.initial /opt/oci-hpc/autoscaling/credentials/key.pem > /opt/oci-hpc/autoscaling/credentials/key.log", "chmod 600 /opt/oci-hpc/autoscaling/credentials/key.pem", "echo ${var.configure} > /tmp/configure.conf", - "timeout 2h /opt/oci-hpc/bin/configure.sh", - "exit_code=$?", + "timeout 2h /opt/oci-hpc/bin/configure.sh | tee /opt/oci-hpc/logs/initial_configure.log", + "exit_code=$${PIPESTATUS[0]}", "/opt/oci-hpc/bin/initial_monitoring.sh", "exit $exit_code" ] connection { diff --git a/bin/bastion.sh b/bin/bastion.sh index c1cadae0..8d6a83f7 100644 --- a/bin/bastion.sh +++ b/bin/bastion.sh @@ -33,11 +33,27 @@ if [ $ID == "ol" ] || [ $ID == "centos" ] ; then sudo yum install -y terraform elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + # checking here as well to be sure that the lock file is not being held + function fix_apt { + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + while [ $apt_process -ge 1 ] + do + echo "wait until apt update is done" + sleep 10s + ps aux | grep "apt update" | grep -v grep + apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` + apt_process=$(( apt_process -1 )) + done + } + fix_apt + if [ $ID == "debian" ] && [ $VERSION_ID == "9" ] ; then echo deb http://ppa.launchpad.net/ansible/ansible/ubuntu trusty main | sudo tee -a /etc/apt/sources.list sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 93C4A3FD7BB9C367 fi - + + sudo sed -i 's/"1"/"0"/g' /etc/apt/apt.conf.d/20auto-upgrades sudo apt purge -y --auto-remove unattended-upgrades sudo systemctl disable apt-daily-upgrade.timer @@ -45,50 +61,34 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then sudo systemctl disable apt-daily.timer sudo systemctl mask apt-daily.service - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - done + sleep 10s sudo apt-mark hold linux-oracle linux-headers-oracle linux-image-oracle - # checking here as well to be sure that the lock file is not being held - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - apt_process=$(( apt_process -1 )) - while [ $apt_process -ge 1 ] - do - echo "wait until apt update is done" - sleep 10s - ps aux | grep "apt update" | grep -v grep - apt_process=`ps aux | grep "apt update" | grep -v grep | wc -l` - 
apt_process=$(( apt_process -1 )) - done - + fix_apt sleep 10s - - wget -O- https://apt.releases.hashicorp.com/gpg | \ - gpg --dearmor | \ - sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg - - echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ - https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ - sudo tee /etc/apt/sources.list.d/hashicorp.list - - sudo apt-get update & - PID1=$! - wait $PID1 - sudo apt -y --fix-broken install - sudo apt-get -y install ansible python python-netaddr python3-pip terraform + fix_apt + + sudo apt-get -y install ansible + output=$? + if [ $output -ne 0 ] + then + fix_apt + sudo apt-get -y install ansible + fi + fix_apt + sudo apt-get -y install python python-netaddr python3 python3-pip + output=$? + if [ $output -ne 0 ] + then + fix_apt + sudo apt-get -y install python python-netaddr python3 python3-pip + fi + fix_apt pip install pip --upgrade pip install pyopenssl --upgrade @@ -99,6 +99,31 @@ elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then # install oci module pip install oci + wget -O- https://apt.releases.hashicorp.com/gpg | \ + gpg --dearmor | \ + sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg + + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ + https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ + sudo tee /etc/apt/sources.list.d/hashicorp.list + + sudo apt update && sudo apt install terraform + output=$? + if [ $output -ne 0 ] + then + fix_apt + echo "Terraform second try" + wget -O- https://apt.releases.hashicorp.com/gpg | \ + gpg --dearmor | \ + sudo tee /usr/share/keyrings/hashicorp-archive-keyring.gpg + + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] \ + https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \ + sudo tee /etc/apt/sources.list.d/hashicorp.list + + sudo apt update && sudo apt install terraform + fi + fix_apt fi ansible-galaxy collection install ansible.netcommon:=2.5.1 --force > /dev/null diff --git a/bin/cleanup.sh b/bin/cleanup.sh index 3df85c4f..54725e88 100755 --- a/bin/cleanup.sh +++ b/bin/cleanup.sh @@ -6,6 +6,12 @@ folder=`dirname $scripts` playbooks_path=$folder/../playbooks/ inventory_path=$folder/../autoscaling/clusters/$1 +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + ssh_options="-i ~/.ssh/id_rsa -o StrictHostKeyChecking=no" iplist=`cat $inventory_path/inventory | awk '{print $2}' | sed 's/ansible_host=//'` if [[ "$2" == "FORCE" ]] diff --git a/bin/create_cluster.sh b/bin/create_cluster.sh index e88a2e69..0cdba575 100755 --- a/bin/create_cluster.sh +++ b/bin/create_cluster.sh @@ -17,6 +17,13 @@ then else debug=0 fi + +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + date=`date '+%Y%m%d%H%M'` scripts=`realpath $0` folder=`dirname $scripts` diff --git a/bin/delete_cluster.sh b/bin/delete_cluster.sh index f771e1da..7328a206 100755 --- a/bin/delete_cluster.sh +++ b/bin/delete_cluster.sh @@ -5,6 +5,13 @@ then echo "No arguments supplied" exit fi + +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + date=`date -u '+%Y%m%d%H%M'` start=`date -u +%s` start_timestamp=`date -u +'%F %T'` diff --git a/bin/gpu_throttle.sh b/bin/gpu_throttle.sh new file mode 100644 index 00000000..72e19071 --- /dev/null +++ b/bin/gpu_throttle.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +/usr/bin/nvidia-smi 
--query-gpu=timestamp,pci.bus,utilization.gpu,utilization.memory,temperature.gpu,power.draw,clocks.mem,clocks.gr,clocks_throttle_reasons.sw_power_cap,clocks_throttle_reasons.hw_thermal_slowdown,clocks_throttle_reasons.hw_power_brake_slowdown,clocks_throttle_reasons.sw_thermal_slowdown,clocks_throttle_reasons.sync_boost,clocks_throttle_reasons.active --format=csv + diff --git a/bin/pcie_el.sh b/bin/pcie_el.sh new file mode 100644 index 00000000..f15061ff --- /dev/null +++ b/bin/pcie_el.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for dev in `/usr/sbin/lspci | grep ConnectX-5 | awk '{print $1}'` +do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: +done + diff --git a/bin/pcie_ubuntu.sh b/bin/pcie_ubuntu.sh new file mode 100644 index 00000000..95c5c456 --- /dev/null +++ b/bin/pcie_ubuntu.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +for dev in `/usr/bin/lspci | grep ConnectX-5 | awk '{print $1}'` +do + echo ${dev} + sudo lspci -vvv -s ${dev} | grep LnkSta: +done + diff --git a/bin/resize.py b/bin/resize.py index 576d9298..acc7b43a 100644 --- a/bin/resize.py +++ b/bin/resize.py @@ -49,7 +49,9 @@ def get_instances(comp_ocid,cn_ocid,CN): for instance_summary in instance_summaries: try: instance=computeClient.get_instance(instance_summary.id).data - vnic_attachment = oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data[0] + for potential_vnic_attachment in oci.pagination.list_call_get_all_results(computeClient.list_vnic_attachments,compartment_id=comp_ocid,instance_id=instance.id).data: + if potential_vnic_attachment.display_name is None: + vnic_attachment = potential_vnic_attachment vnic = virtualNetworkClient.get_vnic(vnic_attachment.vnic_id).data except: continue @@ -339,21 +341,21 @@ def getreachable(instances,username,delay=0): reachable_ips=[] for i in delays: - input_file=open('/tmp/input_hosts_to_check','w') + input_file=open('/tmp/input_hosts_to_check_'+cluster_name,'w') for node in instances: if not node['ip'] in reachable_ips: input_file.write(node['ip']+"\n") input_file.close() my_env = os.environ.copy() my_env["ANSIBLE_HOST_KEY_CHECKING"] = "False" - p = subprocess.Popen(["/opt/oci-hpc/bin/find_reachable_hosts.sh","/tmp/input_hosts_to_check","/tmp/reachable_hosts",username,"0"],env=my_env,stderr = subprocess.PIPE, stdout=subprocess.PIPE) + p = subprocess.Popen(["/opt/oci-hpc/bin/find_reachable_hosts.sh","/tmp/input_hosts_to_check_"+cluster_name,"/tmp/reachable_hosts_"+cluster_name,username,"0"],env=my_env,stderr = subprocess.PIPE, stdout=subprocess.PIPE) while True: output = p.stdout.readline().decode() if output == '' and p.poll() is not None: break if output: print(output.strip()) - output_file=open('/tmp/reachable_hosts','r') + output_file=open('/tmp/reachable_hosts_'+cluster_name,'r') for line in output_file: reachable_ips.append(line.strip()) output_file.close() @@ -516,7 +518,7 @@ def updateTFState(inventory,cluster_name,size): parser.add_argument('--cluster_name', help='Name of the cluster to resize. Defaults to the name included in the bastion') parser.add_argument('mode', help='Mode type. add/remove node options, implicitly configures newly added nodes. Also implicitly reconfigure/restart services like Slurm to recognize new nodes. 
Similarly for remove option, terminates nodes and implicitly reconfigure/restart services like Slurm on rest of the cluster nodes to remove reference to deleted nodes.',choices=['add','remove','remove_unreachable','list','reconfigure'],default='list',nargs='?') parser.add_argument('number', type=int, help="Number of nodes to add or delete if a list of hostnames is not defined",nargs='?') -parser.add_argument('--nodes', help="List of nodes to delete",nargs='+') +parser.add_argument('--nodes', help="List of nodes to delete (Space Separated)",nargs='+') parser.add_argument('--no_reconfigure', help='If present. Does not rerun the playbooks',action='store_true',default=False) parser.add_argument('--user_logging', help='If present. Use the default settings in ~/.oci/config to connect to the API. Default is using instance_principal',action='store_true',default=False) parser.add_argument('--force', help='If present. Nodes will be removed even if the destroy playbook failed',action='store_true',default=False) @@ -651,9 +653,13 @@ def updateTFState(inventory,cluster_name,size): hostnames_to_remove=[i['display_name'] for i in unreachable_instances] else: print("STDOUT: No list of nodes were specified and no unreachable nodes were found") - exit() + exit(1) else: - reachable_instances,unreachable_instances=getreachable(inventory_instances,username,delay=10) + inventory_instances_to_test = [] + for instance_to_test in inventory_instances: + if not instance_to_test['display_name'] in hostnames: + inventory_instances_to_test.append(instance_to_test) + reachable_instances,unreachable_instances=getreachable(inventory_instances_to_test,username,delay=10) hostnames_to_remove=hostnames if len(unreachable_instances): print("STDOUT: At least one unreachable node is in the inventory and was not mentionned with OCI hostname to be removed. Trying anyway") @@ -663,7 +669,7 @@ def updateTFState(inventory,cluster_name,size): if not remove_unreachable: print("STDOUT: At least one unreachable node is in the inventory") print("STDOUT: Not doing anything") - exit() + exit(1) else: hostnames_to_remove=[i['display_name'] for i in unreachable_instances] else: @@ -690,7 +696,7 @@ def updateTFState(inventory,cluster_name,size): if error_code != 0: print("STDOUT: The nodes could not be removed. 
Try running this with Force") if not force: - exit() + exit(1) else: print("STDOUT: Force deleting the nodes") while len(hostnames_to_remove) > 0: @@ -722,7 +728,7 @@ def updateTFState(inventory,cluster_name,size): if args.mode == 'add': size = current_size - hostnames_to_remove_len + args.number update_size = oci.core.models.UpdateInstancePoolDetails(size=size) - ComputeManagementClientCompositeOperations.update_instance_pool_and_wait_for_state(ipa_ocid,update_size,['RUNNING']) + ComputeManagementClientCompositeOperations.update_instance_pool_and_wait_for_state(ipa_ocid,update_size,['RUNNING'],waiter_kwargs={'max_wait_seconds':3600}) updateTFState(inventory,cluster_name,size) if not no_reconfigure: add_reconfigure(comp_ocid,cn_ocid,inventory,CN) \ No newline at end of file diff --git a/bin/resize.sh b/bin/resize.sh index ba956ce2..d2082db8 100755 --- a/bin/resize.sh +++ b/bin/resize.sh @@ -9,6 +9,12 @@ autoscaling_folder=$folder/../autoscaling monitoring_folder=$folder/../monitoring logs_folder=$folder/../logs +if [ $EUID -eq 0 ] +then + echo "Run this script as opc or ubuntu and not as root" + exit +fi + if [ $# -eq 0 ] then python3 $folder/resize.py --help diff --git a/bin/validation.py b/bin/validation.py new file mode 100644 index 00000000..8c28b2a1 --- /dev/null +++ b/bin/validation.py @@ -0,0 +1,618 @@ +import subprocess +import re +import requests +import oci +from datetime import datetime +import argparse +import os +import shlex + + + +# change ownership of all files to opc so that the files can be copied +def changeOwner(path): + out = subprocess.Popen(["whoami"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + username = stdout.split("\n") + del username[-1] + cmd = f'sudo chown -R {username[0]}:{username[0]} {path}' + run_cmd(cmd) + + +def getDateTime(): + # datetime object containing current date and time + now = datetime.now() + dt_string = now.strftime("%m%d%Y%H%M%S") + return dt_string + + +# create directory to hold results +def createDir(): + # Parent Directory path + parent_dir = "/tmp/" + directory = getDateTime() + # Path + path = os.path.join(parent_dir, directory) + try: + os.mkdir(path) + except OSError as error: + print(error) + return path + + +def run_cmd(cmd=None): + """ Run command on shell""" + cmd_split = shlex.split(cmd) + try: + results = subprocess.run(cmd_split, shell=False, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, check=True, encoding='utf8') + output = results.stdout.splitlines() + except subprocess.CalledProcessError as e_process_error: + return (9000, f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") + return output + + +def get_metadata(): + """ Make a request to metadata endpoint """ + headers = { 'Authorization' : 'Bearer Oracle' } + metadata_url = "http://169.254.169.254/opc/" + metadata_ver = "2" + request_url = metadata_url + "v" + metadata_ver + "/instance/" + return requests.get(request_url, headers=headers).json() + + +def get_summary(comp_ocid,cluster_name): + signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() + computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) + CN = True + cn_summaries = computeManagementClient.list_cluster_networks(comp_ocid,display_name=cluster_name).data + running_clusters = 0 + scaling_clusters = 0 + cn_summary=None + for cn_summary_tmp in cn_summaries: + if cn_summary_tmp.lifecycle_state == "RUNNING": + cn_summary = cn_summary_tmp + running_clusters = 
running_clusters + 1 + elif cn_summary_tmp.lifecycle_state == "SCALING": + scaling_clusters = scaling_clusters + 1 + if running_clusters == 0: + cn_summaries = computeManagementClient.list_instance_pools(comp_ocid,display_name=cluster_name).data + if len(cn_summaries) > 0: + CN = False + for cn_summary_tmp in cn_summaries: + if cn_summary_tmp.lifecycle_state == "RUNNING": + cn_summary = cn_summary_tmp + running_clusters = running_clusters + 1 + elif cn_summary_tmp.lifecycle_state == "SCALING": + scaling_clusters = scaling_clusters + 1 + if running_clusters == 0: + if scaling_clusters: + print("No running cluster was found but there is a cluster in SCALING mode, try rerunning in a moment") + else: + print("The cluster was not found") + return None,None,True + if running_clusters > 1: + print("There were multiple running clusters with this name, we selected the one with OCID:"+cn_summary.id) + if CN: + ip_summary=cn_summary.instance_pools[0] + else: + ip_summary=cn_summary + return cn_summary,ip_summary,CN + + +def get_instances(comp_ocid,cn_ocid): + signer = oci.auth.signers.InstancePrincipalsSecurityTokenSigner() + computeManagementClient = oci.core.ComputeManagementClient(config={}, signer=signer) + instance_summaries = oci.pagination.list_call_get_all_results(computeManagementClient.list_cluster_network_instances,comp_ocid,cn_ocid).data + node_list = [] + for instance_summary in instance_summaries: + node_list.append(instance_summary.display_name) + return node_list + + +def parse_inventory(inventory): + try: + inv = open(inventory,"r") + except: + return None + inventory_dict = {} + current_section = None + for line in inv: + if line.strip().startswith("[") and line.strip().endswith("]"): + current_section=line.split('[')[1].split(']')[0] + if not current_section in inventory_dict.keys(): + inventory_dict[current_section]=[] + else: + if not current_section is None: + inventory_dict[current_section].append(line) + inv.close() + return inventory_dict + + +# this is the source of truth for cluster names and total number of nodes +def getResizeClusterNames(filepath): + if filepath is None: + out = subprocess.Popen(["ls /opt/oci-hpc/autoscaling/clusters/"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_name_set = set() + for i in range(len(x)): + if x[i] == 'README': + continue + else: + cluster_name_set.add(x[i]) + return cluster_name_set + else: + out = subprocess.Popen(["cat "+filepath],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_name_set = set() + for i in range(len(x)): + cluster_name_set.add(x[i]) + return cluster_name_set + + +# this is the source of truth for total number of nodes in a cluster +def getResizeNodes(args, metadata, cluster_names, mode): + if mode == 1 or mode == 2: + resize_cluster_node_dict = {} + str = "ocid1.instance." 
+ for cluster in cluster_names: + out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh --cluster_name "+cluster],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + cluster_node_set = set() + for i in range(len(x)): + if str in x[i]: + split_str = x[i].split() + cluster_node_set.add(split_str[0].replace('"','')) + if len(cluster_node_set) > 0: + resize_cluster_node_dict.update({cluster: cluster_node_set}) + if mode == 2 or (mode == 1 and args.cluster_names is None): + out = subprocess.Popen(["/opt/oci-hpc/bin/resize.sh list"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + permanent_cluster = '' + cluster_node_set = set() + for i in range(len(x)): + if str in x[i]: + permanent_cluster = metadata['displayName'].replace('-bastion','') + if permanent_cluster in cluster_names: + return cluster_names, resize_cluster_node_dict + else: + split_str = x[i].split() + cluster_node_set.add(split_str[0].replace('"','')) + if len(cluster_node_set) > 0: + resize_cluster_node_dict.update({permanent_cluster: cluster_node_set}) + cluster_names.add(permanent_cluster) + return cluster_names, resize_cluster_node_dict + + +# given a cluster name, return all the nodes in that cluster +def getNodesInClusters(cluster_name): + out = subprocess.Popen(["cat /etc/hosts | grep "+cluster_name+" | grep local.vcn | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + nodes = set() + x = stdout.split("\n") + for i in range(0,len(x)-1): + nodes.add(x[i]) + return nodes + + +def nodesFromEtcHosts(resize_cluster_names): + etc_node_cluster_dict = {} + etc_cluster_node_dict = {} + for cluster in resize_cluster_names: + etc_nodes = getNodesInClusters(cluster) + for n in etc_nodes: + etc_node_cluster_dict.update({n: cluster}) + etc_cluster_node_dict.update({cluster: etc_nodes}) + return etc_node_cluster_dict, etc_cluster_node_dict + + +def getConsoleNodeName(slurm_node_name): + name = slurm_node_name + ".local.vcn" + out = subprocess.Popen(["cat /etc/hosts | grep "+name+" | awk '{print $4}'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + node_name_output = stdout.split("\n") + del node_name_output[-1] + if len(node_name_output) == 0: + return None + return node_name_output[0] + + +# get number of nodes and their state using slurm +def slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path): + out = subprocess.run(['sinfo','-hN','-o','\"%T %D %N\"'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + lines = out.stdout.decode("utf-8") + x = lines.split("\n") + del x[-1] + warning_node_dict = {} + slurm_node_cluster_dict = {} + for i in range(len(x)): + split_str = x[i].split() + node_state = split_str[0].replace('"','') + node_name = split_str[2].replace('"','') + proper_node_name = getConsoleNodeName(node_name) + if proper_node_name is not None: + if proper_node_name in all_node_cluster_dict: + slurm_node_cluster = all_node_cluster_dict[proper_node_name] + if slurm_node_cluster in resize_cluster_names: + slurm_node_cluster_dict.update({proper_node_name: slurm_node_cluster}) + if node_state.endswith("*"): + warning_node_dict.update({proper_node_name: node_state}) + else: + if path is None: + path = createDir() + changeOwner(path) + f = 
open(path+"/slurmNumNodes.txt", "a") + f.write(proper_node_name + " not found in resize" + "\n") + f.close() + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/slurmNumNodes.txt", "a") + f.write(node_name + " not found in /etc/hosts file for getting the oci console name" + "\n") + f.close() + return slurm_node_cluster_dict, warning_node_dict, path + + +def topologyGetNodes(resize_cluster_names, all_node_cluster_dict, path): + str1 = "SwitchName=inactive" + str2 = "Switches=" + out = subprocess.Popen(["cat /etc/slurm/topology.conf"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + topo_node_cluster_dict = {} + for i in range(len(x)): + if str1 in x[i] or str2 in x[i] or x[i].startswith("#"): + continue + else: + split_str = x[i].split() + node_name_str = split_str[1].rsplit("=") + node_name_1 = node_name_str[1].replace('"','') + node_name = node_name_1.replace(' ','') + res = re.findall(r'\[([^]]*)\]', node_name) + if len(res) == 0: + topo_node_name = getConsoleNodeName(node_name) + if topo_node_name is not None: + if topo_node_name in all_node_cluster_dict: + topo_node_cluster = all_node_cluster_dict[topo_node_name] + if topo_node_cluster in resize_cluster_names: + topo_node_cluster_dict.update({topo_node_name: topo_node_cluster}) + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(topo_node_name + " not found in resize" + "\n") + f.close() + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(node_name + " not found in /etc/hosts file for getting the oci console name" + "\n") + f.close() + else: + out = subprocess.Popen(["scontrol show hostnames "+node_name],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + nodes = stdout.split("\n") + del nodes[-1] + for n in nodes: + oci_console_node_name = getConsoleNodeName(n) + if oci_console_node_name is not None: + if oci_console_node_name in all_node_cluster_dict: + topo_node_cluster = all_node_cluster_dict[oci_console_node_name] + if topo_node_cluster in resize_cluster_names: + topo_node_cluster_dict.update({oci_console_node_name: topo_node_cluster}) + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(oci_console_node_name + " not found in resize" + "\n") + f.close() + else: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/topoNumNodes.txt", "a") + f.write(n + " not found in /etc/hosts file for getting the oci console name" + "\n") + f.close() + return topo_node_cluster_dict, path + + +def etcHostsSame(nodes, path): + out = subprocess.Popen(["linecount=`cat /etc/hosts | wc -l ` ; lines=$((linecount-3)) ; tail -n $lines /etc/hosts | md5sum"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + bastion_md5 = x[0].replace('"','') + md5_set = set() + md5_set.add(bastion_md5) + out = subprocess.Popen(["pdsh -w "+nodes+" 'linecount=`cat /etc/hosts | wc -l ` ; lines=$((linecount-3)) ; tail -n $lines /etc/hosts | md5sum'"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + str = "exit" + for i in 
range(len(x)): + split_str = x[i].split(':') + if str in x[i]: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/etcHostsMD5Sum.txt", "a") + f.write(split_str[1] + " not ssh-able at the moment" + "\n") + f.close() + continue + else: + md5 = split_str[1].lstrip() + if md5 != bastion_md5: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/etcHostsMD5Sum.txt", "a") + f.write("/etc/hosts file does not match on " + split_str[0] + "\n") + f.close() + md5_set.add(md5) + if len(md5_set) > 1: + print("/etc/hosts on bastion and nodes is different") + else: + print("/etc/hosts is same on bastion and all nodes that are ssh-able") + return path + + +def ociCommand(metadata, cluster_names): + comp_ocid=metadata['compartmentId'] + oci_node_cluster_dict = {} + node_list = [] + for cluster in cluster_names: + cn_summary,ip_summary,CN = get_summary(comp_ocid,cluster) + if cn_summary is not None: + cn_ocid = cn_summary.id + node_list = get_instances(comp_ocid, cn_ocid) + for node in node_list: + oci_node_cluster_dict.update({node: cluster}) + elif ip_summary is not None: + cn_ocid = ip_summary.id + node_list = get_instances(comp_ocid, cn_ocid) + for node in node_list: + oci_node_cluster_dict.update({node: cluster}) + return oci_node_cluster_dict + + +def inventoryNodes(metadata, cluster_names): + inventory_node_cluster_dict = {} + permanent_cluster = metadata['displayName'].replace('-bastion','') + for cluster in cluster_names: + if cluster == permanent_cluster: + inventory = "/etc/ansible/hosts" + inventory_dict = parse_inventory(inventory) + inv_list = inventory_dict["compute_configured"] + for i in inv_list: + split_str = i.split() + node_name = split_str[0] + inventory_node_cluster_dict.update({node_name: cluster}) + else: + inventory = "/opt/oci-hpc/autoscaling/clusters/"+cluster+"/inventory" + inventory_dict = parse_inventory(inventory) + inv_list = inventory_dict["compute_configured"] + for i in inv_list: + split_str = i.split() + node_name = split_str[0] + inventory_node_cluster_dict.update({node_name: cluster}) + return inventory_node_cluster_dict + + +def pcie_check(hostfile, path): + out = subprocess.Popen(["cat /etc/os-release | grep PRETTY_NAME="],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + os_name = stdout.split("\n") + del os_name[-1] + if "Linux" in os_name[0]: + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_el.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie_el.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + elif "Ubuntu" in os_name[0]: + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/pcie_ubuntu.sh ~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/pcie_ubuntu.sh\" ; done > "+path+"/pcie-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + else: + print("Cannot run pcie check as OS is not determined to be Linux or Ubuntu") + + +def gpu_throttle(hostfile, path): + out = subprocess.Popen(["sudo cp /opt/oci-hpc/bin/gpu_throttle.sh 
~/."],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + out = subprocess.Popen(["for h in `less "+hostfile+"` ; do echo $h ; ssh $h \"~/gpu_throttle.sh\" ; done > "+path+"/gpu-throttle-output"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + +def getResizeCluster(args, metadata): + resize_cluster_names = [] + resize_cluster_node_dict = {} + resize_node_cluster_dict = {} + resize_cluster_names = getResizeClusterNames(args.cluster_names) + resize_cluster_names, resize_cluster_node_dict = getResizeNodes(args, metadata, resize_cluster_names, 1) + + if len(resize_cluster_names) == 0 or len(resize_cluster_node_dict) == 0: + print("There are no clusters available") + else: + for k, v in resize_cluster_node_dict.items(): + for v1 in v: + resize_node_cluster_dict[v1] = k + + return resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict + +def dictEqualCheck(resize_node_cluster_dict, comp_dict, type, txt_file_name, path): + if resize_node_cluster_dict == comp_dict: + print("Number of nodes from " +type+ " is same as resize") + else: + for key in resize_node_cluster_dict.keys(): + if not key in comp_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/" +txt_file_name+ ".txt", "a") + f.write(key + " is not in " +type+ " file" + "\n") + f.close() + for key in comp_dict.keys(): + if not key in resize_node_cluster_dict: + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/" +txt_file_name+ ".txt", "a") + f.write(key + " is not in resize list" + "\n") + f.close() + return path + +def runChecks(args, type, name, hostFileWritten, resize_node_cluster_dict, metadata, path): + if type is not None: + if type == 'y' or type == 'Y': + if args.cluster_names is not None: + if hostFileWritten is False: + if len(resize_node_cluster_dict) == 0: + resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) + if len(resize_cluster_names) == 0: + exit() + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/host.txt", "a") + for v in resize_node_cluster_dict.keys(): + hostFileWritten = True + f.write(str(v) + "\n") + f.close() + hostfile = path+"/host.txt" + if name == "pcie": + pcie_check(hostfile, path) + if name == "gpu throttle": + gpu_throttle(hostfile, path) + if name == "/etc/hosts md5 sum": + out = subprocess.Popen(["cat "+hostfile],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + nodes_comma = ','.join(str(s) for s in x) + path = etcHostsSame(nodes_comma, path) + else: + print("Provide cluster_names file or hosts file to run " +name+ " check") + else: + hostfile = type + if path is None: + path = createDir() + changeOwner(path) + if name == "pcie": + pcie_check(hostfile, path) + if name == "gpu throttle": + gpu_throttle(hostfile, path) + if name == "/etc/hosts md5 sum": + out = subprocess.Popen(["cat "+hostfile],stdout=subprocess.PIPE, stderr=subprocess.STDOUT,shell=True,universal_newlines=True) + stdout,stderr = out.communicate() + x = stdout.split("\n") + del x[-1] + nodes_comma = ','.join(str(s) for s in x) + path = etcHostsSame(nodes_comma, path) + return hostFileWritten, path + + +############### + +parser = argparse.ArgumentParser(description = 'Performs these checks. 
\ +-> Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, \ + inventory files. \ +-> PCIe bandwidth check \ +-> GPU Throttle check \ +-> /etc/hosts md5 sum validation \ + Provide at least one argument: [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS] \ +Optional argument with [-n NUM_NODES] [-p PCIE] [-g GPU_THROTTLE] [-e ETC_HOSTS]: [-cn CLUSTER_NAMES] --> \ +Provide a file that lists each cluster on a separate line for which you want to validate the \ + number of nodes and/or pcie check and/or gpu throttle check and/or /etc/hosts md5 sum. \ +For all of the above, you can either provide y or Y along with -cn or you can give the hostfile path (each host on a separate line) for each argument') + +parser.add_argument('-n', '--num_nodes', help = "Check the number of nodes is consistent across resize, /etc/hosts, slurm, topology.conf, OCI console, \ + inventory files.") +parser.add_argument('-cn', '--cluster_names', help = "Provide a file that lists each cluster on a separate line for which you want to validate the \ + number of nodes and/or pcie check and/or gpu throttle check.") +parser.add_argument('-p', '--pcie', help = "Runs PCIe bandwidth check") +parser.add_argument('-g', '--gpu_throttle', help = "Performs GPU throttle check") +parser.add_argument('-e', '--etc_hosts', help = "Performs md5 sum check on all hosts and checks if it matches with the bastion") + +args = parser.parse_args() + +args_vars = vars(args) +if not any(args_vars.values()): + parser.error('No arguments provided') + exit() + +metadata=get_metadata() + +path = None + +resize_cluster_names = [] +resize_cluster_node_dict = {} +resize_node_cluster_dict = {} + +if args.num_nodes is not None: + resize_cluster_names, resize_cluster_node_dict, resize_node_cluster_dict = getResizeCluster(args, metadata) + + if len(resize_cluster_names) > 0: + + # get all clusters and its corresponding nodes --> this is required to get the cluster name of the nodes from slurm and topology.conf \ + # so as to filter out clusters if -cn option is given + all_cluster_names = [] + all_cluster_node_dict = {} + all_node_cluster_dict = {} + all_cluster_names = getResizeClusterNames(None) + all_cluster_names, all_cluster_node_dict = getResizeNodes(args, metadata, all_cluster_names, 2) + if len(all_cluster_names) == 0 or len(all_cluster_node_dict) == 0: + print("There are no clusters available") + else: + for k, v in all_cluster_node_dict.items(): + for v1 in v: + all_node_cluster_dict[v1] = k + + etc_node_cluster_dict, etc_cluster_node_dict = nodesFromEtcHosts(resize_cluster_names) + + slurm_node_cluster_dict, warning_node_dict, path = slurmGetNodes(resize_cluster_names, all_node_cluster_dict, path) + + topo_node_cluster_dict, path = topologyGetNodes(resize_cluster_names, all_node_cluster_dict, path) + + inventory_node_cluster_dict = inventoryNodes(metadata, resize_cluster_names) + + oci_node_cluster_dict = ociCommand(metadata, resize_cluster_names) + + path = dictEqualCheck(resize_node_cluster_dict, etc_node_cluster_dict, "/etc/hosts", "etcHostsNumNodes", path) + if resize_node_cluster_dict == etc_node_cluster_dict: + path = dictEqualCheck(resize_node_cluster_dict, slurm_node_cluster_dict, "slurm", "slurmNumNodes", path) + path = dictEqualCheck(resize_node_cluster_dict, topo_node_cluster_dict, "topology.conf", "topoNumNodes", path) + + path = dictEqualCheck(resize_node_cluster_dict, inventory_node_cluster_dict, "inventory", "inventoryNumNodes", path) + path = 
dictEqualCheck(resize_node_cluster_dict, oci_node_cluster_dict, "oci cli", "ociCliNumNodes", path) + + if len(warning_node_dict) > 0: + for key in warning_node_dict.keys(): + if path is None: + path = createDir() + changeOwner(path) + f = open(path+"/slurmWarnNodes.txt", "a") + f.write(key + " is in slurm state " + warning_node_dict[key] + "\n") + f.close() + +hostFileWritten = False + +hostFileWritten, path = runChecks(args, args.pcie, "pcie", hostFileWritten, resize_node_cluster_dict, metadata, path) +hostFileWritten, path = runChecks(args, args.gpu_throttle, "gpu throttle",hostFileWritten, resize_node_cluster_dict, metadata, path) +hostFileWritten, path = runChecks(args, args.etc_hosts, "/etc/hosts md5 sum", hostFileWritten, resize_node_cluster_dict, metadata, path) + +if path is not None: + print(f"Output is in folder: {path}") + diff --git a/conf/variables.tpl b/conf/variables.tpl index f87fc58c..9a100245 100755 --- a/conf/variables.tpl +++ b/conf/variables.tpl @@ -13,7 +13,7 @@ variable "boot_volume_size" {default = "##BOOT##"} variable "use_marketplace_image" { default = "##USEMP##" } variable "use_old_marketplace_image" { default = "##USEOLDMP##" } variable "scratch_nfs_path" { default = "${scratch_nfs_path}" } -variable "use_scratch_nfs" { default = true } +variable "use_scratch_nfs" { default = ${use_scratch_nfs} } variable "cluster_nfs_path" {default = "${cluster_nfs_path}"} variable "use_cluster_nfs" { default = ${use_cluster_nfs} } variable "image" { default = "##IMAGE##" } @@ -24,9 +24,12 @@ variable "public_subnet_id" { default = "${public_subnet_id}"} variable "public_subnet" {default = "${public_subnet}"} variable "private_subnet_id" { default = "##PRIVATE_SUBNET_ID##"} variable "private_subnet" {default = "##PRIVATE_SUBNET##"} +variable "rdma_subnet" { default = "${rdma_subnet}" } variable "slurm" { default = ${slurm} } variable "rack_aware" { default = ${rack_aware} } variable "pyxis" { default = ${pyxis} } +variable "pam" { default = ${pam} } +variable "sacct_limits" { default = ${sacct_limits} } variable "enroot" { default = ${enroot} } variable "slurm_nfs_path" { default = "${slurm_nfs_path}" } variable "spack" { default = ${spack} } @@ -46,9 +49,12 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" + "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" + "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0" } } @@ -82,6 +88,8 @@ variable "bastion_name" {default = "${bastion_name}"} variable "bastion_ip" {default = "${bastion_ip}"} variable "backup_name" {default = "${backup_name}"} variable "backup_ip" {default = "${backup_ip}"} +variable "login_name" {default = "${login_name}"} +variable "login_ip" {default = "${login_ip}"} variable "scripts_folder" {default = "/opt/oci-hpc/bin/"} variable "autoscaling_folder" {default = "/opt/oci-hpc/autoscaling/"} variable "cluster_block_volume_size" {default="${cluster_block_volume_size}"} @@ -117,4 +125,6 @@ variable 
"use_multiple_ads" { default = ${use_multiple_ads} } variable "bastion_username" { default = "${bastion_username}" } variable "compute_username" { default = "${compute_username}" } -variable "localdisk" { default = "${localdisk}" } \ No newline at end of file +variable "localdisk" { default = "${localdisk}" } + +variable "instance_pool_ocpus_denseIO_flex" { default = "##OCPU##"} diff --git a/data.tf b/data.tf index 475d8b26..c90b6ced 100755 --- a/data.tf +++ b/data.tf @@ -70,4 +70,11 @@ data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reach count = (var.private_deployment && var.slurm_ha) ? 1 : 0 private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id private_ip = tostring(oci_core_instance.backup[0].private_ip) +} + +data "oci_resourcemanager_private_endpoint_reachable_ip" "private_endpoint_reachable_ip_login" { + #Required + count = (var.private_deployment && var.login_node) ? 1 : 0 + private_endpoint_id = oci_resourcemanager_private_endpoint.rms_private_endpoint[0].id + private_ip = tostring(oci_core_instance.login[0].private_ip) } \ No newline at end of file diff --git a/inventory.tpl b/inventory.tpl index 8b941154..3134339a 100755 --- a/inventory.tpl +++ b/inventory.tpl @@ -2,6 +2,8 @@ ${bastion_name} ansible_host=${bastion_ip} ansible_user=${bastion_username} role=bastion [slurm_backup] %{ if backup_name != "" }${backup_name} ansible_host=${backup_ip} ansible_user=${compute_username} role=bastion%{ endif } +[login] +%{ if login_name != "" }${login_name} ansible_host=${login_ip} ansible_user=${compute_username} role=login%{ endif } [compute_to_add] [compute_configured] %{ for host, ip in compute ~} @@ -58,8 +60,16 @@ admin_username = ${admin_username} instance_type=permanent enroot=${enroot} pyxis=${pyxis} +pam=${pam} privilege_sudo=${privilege_sudo} privilege_group_name=${privilege_group_name} latency_check=${latency_check} compute_username=${compute_username} -bastion_username=${bastion_username} \ No newline at end of file +bastion_username=${bastion_username} +region= ${region} +tenancy_ocid = ${tenancy_ocid} +inst_prin = ${inst_prin} +api_fingerprint = ${api_fingerprint} +api_user_ocid = ${api_user_ocid} +sacct_limits=${sacct_limits} + diff --git a/locals.tf b/locals.tf index 016f1800..a7f83527 100755 --- a/locals.tf +++ b/locals.tf @@ -5,7 +5,12 @@ locals { image_ocid = var.unsupported ? var.image_ocid : var.image custom_bastion_image_ocid = var.unsupported_bastion ? var.unsupported_bastion_image : var.custom_bastion_image + custom_login_image_ocid = var.unsupported_login ? var.unsupported_login_image : var.custom_login_image + shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape + instance_pool_ocpus = local.shape == "VM.DenseIO.E4.Flex" ? var.instance_pool_ocpus_denseIO_flex : var.instance_pool_ocpus + bastion_ocpus = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? var.bastion_ocpus_denseIO_flex : var.bastion_ocpus + login_ocpus = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? var.login_ocpus_denseIO_flex : var.login_ocpus // ips of the instances cluster_instances_ips = var.cluster_network ? data.oci_core_instance.cluster_network_instances.*.private_ip : data.oci_core_instance.instance_pool_instances.*.private_ip @@ -25,14 +30,18 @@ locals { bastion_image = var.use_standard_image ? 
oci_core_app_catalog_subscription.bastion_mp_image_subscription[0].listing_resource_id : local.custom_bastion_image_ocid + login_image = var.login_node && ( var.use_standard_image_login || var.use_marketplace_image_login ) ? oci_core_app_catalog_subscription.login_mp_image_subscription[0].listing_resource_id : local.custom_login_image_ocid + cluster_network_image = var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid instance_pool_image = ! var.cluster_network && var.use_marketplace_image ? oci_core_app_catalog_subscription.mp_image_subscription[0].listing_resource_id : local.image_ocid // image = (var.cluster_network && var.use_marketplace_image == true) || (var.cluster_network == false && var.use_marketplace_image == false) ? var.image : data.oci_core_images.linux.images.0.id - is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [var.bastion_ocpus]:[] - is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [var.instance_pool_ocpus]:[] + is_bastion_flex_shape = length(regexall(".*VM.*.*Flex$", var.bastion_shape)) > 0 ? [local.bastion_ocpus]:[] + is_login_flex_shape = length(regexall(".*VM.*.*Flex$", var.login_shape)) > 0 ? [local.login_ocpus]:[] + + is_instance_pool_flex_shape = length(regexall(".*VM.*.*Flex$", var.instance_pool_shape)) > 0 ? [local.instance_pool_ocpus]:[] bastion_mount_ip = var.bastion_block ? element(concat(oci_core_volume_attachment.bastion_volume_attachment.*.ipv4, [""]), 0) : "none" @@ -48,10 +57,13 @@ locals { cluster_ocid = var.node_count > 0 ? var.cluster_network ? oci_core_cluster_network.cluster_network[0].id : oci_core_instance_pool.instance_pool[0].id : "" host = var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip[0].ip_address : oci_core_instance.bastion.public_ip bastion_bool_ip = var.private_deployment ? false : true + login_bool_ip = var.private_deployment ? false : true bastion_subnet = var.private_deployment ? oci_core_subnet.private-subnet : oci_core_subnet.private-subnet private_subnet_cidr = var.private_deployment ? [var.public_subnet, var.private_subnet] : [var.private_subnet] host_backup = var.slurm_ha ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_backup[0].ip_address : oci_core_instance.backup[0].public_ip : "none" + host_login = var.login_node ? var.private_deployment ? data.oci_resourcemanager_private_endpoint_reachable_ip.private_endpoint_reachable_ip_login[0].ip_address : oci_core_instance.login[0].public_ip : "none" timeout_per_batch= var.cluster_network ? 30 : 15 timeout_ip = join("",[ (( var.node_count - ( var.node_count % 20 ) )/20 + 1 ) * local.timeout_per_batch,"m"]) + } diff --git a/login.tf b/login.tf new file mode 100644 index 00000000..22200fc5 --- /dev/null +++ b/login.tf @@ -0,0 +1,59 @@ +resource "oci_core_volume" "login_volume" { + count = var.login_block && var.login_node ? 1 : 0 + availability_domain = var.login_ad + compartment_id = var.targetCompartment + display_name = "${local.cluster_name}-login" + size_in_gbs = var.login_block_volume_size + vpus_per_gb = split(".", var.login_block_volume_performance)[0] +} + + +resource "oci_core_volume_attachment" "login_volume_attachment" { + count = var.login_block && var.login_node ? 
1 : 0 + attachment_type = "iscsi" + volume_id = oci_core_volume.login_volume[0].id + instance_id = oci_core_instance.login[0].id + display_name = "${local.cluster_name}-login-volume-attachment" + device = "/dev/oracleoci/oraclevdb" +} + +resource "oci_core_instance" "login" { + count = var.login_node ? 1 : 0 + depends_on = [oci_core_subnet.public-subnet] + availability_domain = var.login_ad + compartment_id = var.targetCompartment + shape = var.login_shape + + dynamic "shape_config" { + for_each = local.is_login_flex_shape + content { + ocpus = shape_config.value + memory_in_gbs = var.login_custom_memory ? var.login_memory : 16 * shape_config.value + } + } + agent_config { + is_management_disabled = true + } + display_name = "${local.cluster_name}-login" + + freeform_tags = { + "cluster_name" = local.cluster_name + "parent_cluster" = local.cluster_name + } + + metadata = { + ssh_authorized_keys = "${var.ssh_key}\n${tls_private_key.ssh.public_key_openssh}" + user_data = base64encode(data.template_file.bastion_config.rendered) + } + source_details { +// source_id = var.use_standard_image ? data.oci_core_images.linux.images.0.id : local.custom_bastion_image_ocid + source_id = local.login_image + boot_volume_size_in_gbs = var.login_boot_volume_size + source_type = "image" + } + + create_vnic_details { + subnet_id = local.bastion_subnet_id + assign_public_ip = local.login_bool_ip + } +} diff --git a/marketplace.tf b/marketplace.tf index 3aee746f..5917390b 100755 --- a/marketplace.tf +++ b/marketplace.tf @@ -2,8 +2,10 @@ locals { // listing_number = split(".", var.marketplace_listing)[0] mp_listing_id = var.use_marketplace_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id : substr(var.marketplace_listing,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_bastion_listing_id = var.use_standard_image ? var.use_old_marketplace_image ? var.old_marketplace_listing_id :var.marketplace_listing_id_HPC : "" + mp_login_listing_id = var.use_marketplace_image_login ? var.use_old_marketplace_image_login ? var.old_marketplace_listing_id : substr(var.marketplace_listing_login,0,3) == "HPC" ? var.marketplace_listing_id_HPC : var.marketplace_listing_id_GPU : "" mp_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id[var.marketplace_listing] mp_bastion_version_id = var.use_old_marketplace_image ? var.marketplace_version_id[split(".", var.old_marketplace_listing)[0]] : var.marketplace_version_id["HPC_OL7"] + mp_login_version_id = var.use_old_marketplace_image_login ? var.marketplace_version_id[split(".", var.old_marketplace_listing_login)[0]] : var.marketplace_version_id[var.marketplace_listing_login] } /* @@ -73,3 +75,31 @@ resource "oci_core_app_catalog_subscription" "bastion_mp_image_subscription" { create = "20m" } } + +data "oci_core_app_catalog_listing_resource_versions" "login_app_catalog_listing_resource_versions" { + count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 + listing_id = local.mp_login_listing_id +} + +resource "oci_core_app_catalog_listing_resource_version_agreement" "login_mp_image_agreement" { + count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 
1 : 0 + + listing_id = local.mp_login_listing_id + listing_resource_version = local.mp_login_version_id + +} + +resource "oci_core_app_catalog_subscription" "login_mp_image_subscription" { + count = var.login_node && ( var.use_marketplace_image_login || var.use_standard_image_login ) ? 1 : 0 + compartment_id = var.targetCompartment + eula_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].eula_link + listing_id = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_id + listing_resource_version = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].listing_resource_version + oracle_terms_of_use_link = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].oracle_terms_of_use_link + signature = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].signature + time_retrieved = oci_core_app_catalog_listing_resource_version_agreement.login_mp_image_agreement[0].time_retrieved + + timeouts { + create = "20m" + } +} diff --git a/outputs.tf b/outputs.tf index 7f95f526..b11d640f 100755 --- a/outputs.tf +++ b/outputs.tf @@ -8,4 +8,8 @@ output "private_ips" { output "backup" { value = var.slurm_ha ? local.host_backup : "No Slurm Backup Defined" +} + +output "login" { + value = var.login_node ? local.host_login : "No Login Node Defined" } \ No newline at end of file diff --git a/playbooks/destroy.yml b/playbooks/destroy.yml index 2b5ba8cf..520b756d 100755 --- a/playbooks/destroy.yml +++ b/playbooks/destroy.yml @@ -1,4 +1,4 @@ -- hosts: compute +- hosts: compute, slurm_backup become: true vars: destroy: true @@ -9,7 +9,7 @@ - include_role: name: slurm when: slurm|default(false)|bool -- hosts: bastion +- hosts: bastion, slurm_backup, login become: true vars: destroy: true diff --git a/playbooks/new_nodes.yml b/playbooks/new_nodes.yml index d86ac463..c54b519f 100755 --- a/playbooks/new_nodes.yml +++ b/playbooks/new_nodes.yml @@ -49,7 +49,7 @@ name: rdma-interface when: cluster_network|bool -- hosts: bastion,slurm_backup,compute +- hosts: bastion,slurm_backup,login,compute become: true vars: destroy: false @@ -163,6 +163,8 @@ - include_role: name: nvidia-enroot when: enroot|default(true)|bool + - include_role: + name: tuned - hosts: compute @@ -171,11 +173,11 @@ name: latency_check when: cluster_network|bool and not 'GPU' in shape -- hosts: compute +- hosts: compute, slurm_backup vars: destroy: false initial: false - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: - "/opt/oci-hpc/conf/queues.conf" @@ -191,4 +193,4 @@ when: slurm|default(false)|bool - include_role: name: telegraf - when: monitoring|default(false)|bool + when: monitoring|default(false)|bool \ No newline at end of file diff --git a/playbooks/resize_add.yml b/playbooks/resize_add.yml index 0327973d..11ed903e 100755 --- a/playbooks/resize_add.yml +++ b/playbooks/resize_add.yml @@ -47,7 +47,7 @@ name: rdma-interface when: cluster_network|bool -- hosts: bastion,slurm_backup,compute +- hosts: bastion,slurm_backup,login,compute become: true vars: destroy: false @@ -165,6 +165,8 @@ - include_role: name: nvidia-enroot when: enroot|default(true)|bool + - include_role: + name: tuned - hosts: compute_to_add @@ -177,7 +179,7 @@ vars: destroy: false initial: false - download_path: "{{ 
cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: - "/opt/oci-hpc/conf/queues.conf" diff --git a/playbooks/resize_remove.yml b/playbooks/resize_remove.yml index e3dcecab..c75ea9fc 100755 --- a/playbooks/resize_remove.yml +++ b/playbooks/resize_remove.yml @@ -1,4 +1,4 @@ -- hosts: bastion, slurm_backup, compute +- hosts: bastion, slurm_backup, compute, login become: true gather_facts: true vars: @@ -9,7 +9,7 @@ - include_role: name: etc-hosts -- hosts: compute_to_destroy +- hosts: compute_to_destroy, slurm_backup become: true vars: destroy: true diff --git a/playbooks/resize_remove_unreachable.yml b/playbooks/resize_remove_unreachable.yml index 1760ab7d..4a5b95e7 100644 --- a/playbooks/resize_remove_unreachable.yml +++ b/playbooks/resize_remove_unreachable.yml @@ -1,4 +1,4 @@ -- hosts: bastion, compute +- hosts: bastion, compute, slurm_backup, login become: true gather_facts: true vars: diff --git a/playbooks/roles/autoscaling_mon/files/initial.sh b/playbooks/roles/autoscaling_mon/files/initial.sh index 77d5198a..b4c8bc87 100644 --- a/playbooks/roles/autoscaling_mon/files/initial.sh +++ b/playbooks/roles/autoscaling_mon/files/initial.sh @@ -3,7 +3,7 @@ sudo yum install -y grafana-7.5.0-1.x86_64.rpm sudo yum install -y https://dev.mysql.com/get/mysql80-community-release-el7-3.noarch.rpm sudo yum install -y mysql-shell sudo pip3 install protobuf==3.19.4 -sudo pip3 install mysql-connector-python +sudo pip3 install mysql-connector-python==8.0.31 sudo systemctl daemon-reload sudo systemctl start grafana-server sudo systemctl status grafana-server diff --git a/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh b/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh index b8550e2e..51a661fb 100644 --- a/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh +++ b/playbooks/roles/autoscaling_mon/files/monitor_slurm.sh @@ -56,12 +56,23 @@ def getClusterName(node): out = subprocess.Popen(['scontrol','show','topology',node], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True) stdout,stderr = out.communicate() clusterName = None - if len(stdout.split('\n')) > 2: - for output in stdout.split('\n')[:-1]: - if "Switches=" in output: - clusterName=output.split()[0].split('SwitchName=')[1] - elif len(stdout.split('\n')) == 2: - clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] + try: + if len(stdout.split('\n')) > 2: + for output in stdout.split('\n')[:-1]: + if "Switches=" in output: + clusterName=output.split()[0].split('SwitchName=')[1] + break + elif "SwitchName=inactive-" in output: + continue + else: + clusterName=output.split()[0].split('SwitchName=')[1] + elif len(stdout.split('\n')) == 2: + clusterName=stdout.split('\n')[0].split()[0].split('SwitchName=')[1] + if clusterName.startswith("inactive-"): + return "NOCLUSTERFOUND" + except: + print('No ClusterName could be found for '+node) + return "NOCLUSTERFOUND" return clusterName #def getCPUsDetails(job): diff --git a/playbooks/roles/autoscaling_mon/tasks/el.yml b/playbooks/roles/autoscaling_mon/tasks/el.yml index 8ecfeba6..c14ccd72 100755 --- a/playbooks/roles/autoscaling_mon/tasks/el.yml +++ b/playbooks/roles/autoscaling_mon/tasks/el.yml @@ -53,8 +53,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum @@ -81,19 +81,6 @@ 
no_log: false register: existing_api_keys -- name: Import mysql-2022 key - become: true - rpm_key: - state: present - key: https://repo.mysql.com/RPM-GPG-KEY-mysql-2022 - -- name: install mysql - vars: - package_name: - - https://dev.mysql.com/get/mysql80-community-release-el7-5.noarch.rpm - include_role: - name: safe_yum - - name: install mysql-shell and connector vars: package_name: @@ -226,6 +213,6 @@ - name: install protobuf v3.19.4 and mysql connector become: true pip: - name: [protobuf==3.19.4,mysql-connector-python] + name: [protobuf==3.19.4,mysql-connector-python==8.0.31] executable: pip3 ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml index adb81837..4f46e0a8 100644 --- a/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml +++ b/playbooks/roles/autoscaling_mon/tasks/ubuntu.yml @@ -65,8 +65,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum @@ -239,18 +239,18 @@ # overwrite: yes # path: files/cluster.json -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum +# - name: Install pip +# vars: +# package_name: +# - python3-pip +# include_role: +# name: safe_yum - name: install protobuf v3.19.4 and mysql connector become: true vars: ansible_python_interpreter: /usr/bin/python3 pip: - name: [protobuf==3.19.4,mysql-connector-python] + name: [protobuf==3.19.4,mysql-connector-python==8.0.31] executable: pip3 ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/cluster-cli/tasks/debian.yml b/playbooks/roles/cluster-cli/tasks/debian.yml index 7e0f9e57..c1f5e422 100644 --- a/playbooks/roles/cluster-cli/tasks/debian.yml +++ b/playbooks/roles/cluster-cli/tasks/debian.yml @@ -1,12 +1,13 @@ --- - - name: Install required packages - apt: - name: + vars: + package_name: - python3-click - python3-ldap3 - state: present - update_cache: yes + package_state: present + package_cache: true + include_role: + name: safe_yum - name: copy cluster cli copy: diff --git a/playbooks/roles/destroy_unreachable/tasks/common.yml b/playbooks/roles/destroy_unreachable/tasks/common.yml index 5ec25b83..111778da 100644 --- a/playbooks/roles/destroy_unreachable/tasks/common.yml +++ b/playbooks/roles/destroy_unreachable/tasks/common.yml @@ -46,6 +46,15 @@ with_items: "{{all_unreachable_nodes}}" ignore_unreachable: yes +- name: remove from /etc/hosts + become: true + lineinfile: + path: "/etc/hosts" + regexp: "{{item}}-rdma\\s" + state: absent + with_items: "{{all_unreachable_nodes}}" + ignore_unreachable: yes + - name: "remove from hostfile.rdma.{{ cluster_name }}" lineinfile: path: "/etc/opt/oci-hpc/hostfile.rdma.{{ cluster_name }}" diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml index 485f865d..4471c98c 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm-rack-aware.yml @@ -47,7 +47,7 @@ when: ( item.stdout_lines | length ) == 0 - name: get UpperSwitchNames - shell: "scontrol show topology {{item}} | grep -v inactive | grep Switches= | awk '{print $1}' | cut -d \"=\" -f 2" + shell: "scontrol show topology {{item}} | grep -v inactive | grep Switches= | grep Level=1 | awk '{print $1}' | cut -d \"=\" -f 2" register: current_UpperSwitchName run_once: true delegate_to: 127.0.0.1 @@ 
-128,7 +128,7 @@ state: present with_items: "{{unreachable_slurm_nodes}}" ignore_errors: yes - when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1 + when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) > 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2 run_once: true delegate_to: 127.0.0.1 @@ -148,7 +148,7 @@ state: absent with_items: "{{unreachable_slurm_nodes}}" ignore_unreachable: yes - when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1 + when: ( not upperswitchnames[item] is match("inactive-.*") ) and ( ( racks_on_switch_dict[item] | difference(switchnames[item]) | length ) == 0 ) and ( upperswitchnames[item] | length ) > 1 and ( nodes_on_switch[item] | length ) < 2 run_once: true delegate_to: 127.0.0.1 @@ -255,6 +255,18 @@ delegate_to: 127.0.0.1 when: ('bastion' in group_names) +- name: move topology.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/topology.conf' + src: '{{ slurm_conf_path }}/topology.conf' + force: yes + register: topology_copied + until: topology_copied is not failed + retries: 10 + delay: 5 + when: ('slurm_backup' in group_names) + - name: Reconfigure Slurm for topology become: true command: "scontrol reconfigure" diff --git a/playbooks/roles/destroy_unreachable/tasks/slurm.yml b/playbooks/roles/destroy_unreachable/tasks/slurm.yml index ada27290..e06e77a3 100644 --- a/playbooks/roles/destroy_unreachable/tasks/slurm.yml +++ b/playbooks/roles/destroy_unreachable/tasks/slurm.yml @@ -145,6 +145,18 @@ delegate_to: 127.0.0.1 when: ('bastion' in group_names) +- name: move topology.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/topology.conf' + src: '{{ slurm_conf_path }}/topology.conf' + force: yes + register: topology_copied + until: topology_copied is not failed + retries: 10 + delay: 5 + when: ('slurm_backup' in group_names) + - name: Reconfigure Slurm for topology become: true command: "scontrol reconfigure" diff --git a/playbooks/roles/etc-hosts/tasks/common.yml b/playbooks/roles/etc-hosts/tasks/common.yml index 7fffd1bf..4a128bc0 100644 --- a/playbooks/roles/etc-hosts/tasks/common.yml +++ b/playbooks/roles/etc-hosts/tasks/common.yml @@ -52,13 +52,13 @@ run_once: true when: not destroy|bool and groups['compute']|length > 0 -- name: move /etc/hosts on backup slurm +- name: move /etc/hosts on backup slurm and login node become: true copy: dest: /etc/hosts src: /etc/hosts force: yes - when: ( not destroy|bool ) and ('slurm_backup' in group_names) + when: ( not destroy|bool ) and (('slurm_backup' in group_names) or ('login' in group_names)) - name: move /etc/hosts on all compute nodes become: true @@ -66,7 +66,7 @@ dest: /etc/hosts src: /tmp/hosts.etc.{{ cluster_name }} force: yes - when: ( not destroy|bool ) and (not 'bastion' in group_names) and (not 'slurm_backup' in group_names) + when: ( not destroy|bool ) and (not 'bastion' in group_names) and (not 'slurm_backup' in group_names) and (not 'login' in group_names) - name: remove cluster from etc-host become: true @@ -86,4 +86,14 @@ state: absent delegate_to: "{{ groups['slurm_backup'][0] }}" run_once: true - when: destroy|bool and (groups['slurm_backup']|length > 
0)|bool \ No newline at end of file + when: destroy|bool and (groups['slurm_backup']|length > 0)|bool + +- name: remove cluster from etc-host on login + become: true + blockinfile: + dest: /etc/hosts + marker: "# {mark} ANSIBLE MANAGED BLOCK {{ cluster_name }}" + state: absent + delegate_to: "{{ groups['login'][0] }}" + run_once: true + when: destroy|bool and (groups['login']|length > 0)|bool \ No newline at end of file diff --git a/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 b/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 index f5fdd943..0289b5c0 100755 --- a/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 +++ b/playbooks/roles/etc-hosts/templates/etc-hosts-bastion.j2 @@ -5,4 +5,8 @@ {% for item in groups['slurm_backup'] %} {% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} {{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} backup +{% endfor %} +{% for item in groups['login'] %} +{% set short_name = hostvars[item]['ansible_fqdn'].split('.') %} +{{ hostvars[item]['ansible_host'] }} {{ hostvars[item]['ansible_fqdn'] }} {{ short_name[0] }} login {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/fix_broken/tasks/ubuntu.yml b/playbooks/roles/fix_broken/tasks/ubuntu.yml index 81e27814..a522df45 100644 --- a/playbooks/roles/fix_broken/tasks/ubuntu.yml +++ b/playbooks/roles/fix_broken/tasks/ubuntu.yml @@ -27,10 +27,13 @@ until: result.stdout | int == 0 - name: Purge unattended-upgrades - apt: - name: unattended-upgrades - purge: yes - state: absent + vars: + package_name: + - unattended-upgrades + package_state: absent + package_purge: true + include_role: + name: safe_yum ignore_errors: yes - name: stop and mask timers diff --git a/playbooks/roles/grafana/tasks/el.yml b/playbooks/roles/grafana/tasks/el.yml index 7f38fe90..7172bf96 100755 --- a/playbooks/roles/grafana/tasks/el.yml +++ b/playbooks/roles/grafana/tasks/el.yml @@ -15,8 +15,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum diff --git a/playbooks/roles/grafana/tasks/ubuntu.yml b/playbooks/roles/grafana/tasks/ubuntu.yml index f3e5fc2d..af9fa526 100644 --- a/playbooks/roles/grafana/tasks/ubuntu.yml +++ b/playbooks/roles/grafana/tasks/ubuntu.yml @@ -27,8 +27,8 @@ - name: install grafana vars: package_name: - - grafana - package_state: latest + - grafana-8.5.21-1 + package_state: present include_role: name: safe_yum diff --git a/playbooks/roles/influxdb/tasks/config_influxdb.yml b/playbooks/roles/influxdb/tasks/config_influxdb.yml new file mode 100644 index 00000000..96d0ec86 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/config_influxdb.yml @@ -0,0 +1,51 @@ +--- +- name: Create /etc/opt/oci-hpc/passwords/influxdb + become: true + file: + path: /etc/opt/oci-hpc/passwords/influxdb + state: directory + owner: '{{ ansible_user }}' + mode: 0770 + group: '{{ ansible_user }}' + recurse: yes + +- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords + set_fact: + tmp_pwd: "{{ lookup('password', + '/etc/opt/oci-hpc/passwords/influxdb/root.txt + chars=ascii_letters,digits,hexdigits') }}" + +- name: Get influx password from /etc/opt/oci-hpc/passwords + set_fact: + influx_admin_pwd: "{{ lookup('password', + '/etc/opt/oci-hpc/passwords/influxdb/root.txt + chars=ascii_letters,digits,hexdigits') }}" + +- name: Start InfluxDB + become: true + service: + name: influxdb + state: started + enabled: true + + 
+- name: Set configuration directory path + become: true + file: + path: "{{ influxdb_configuration_dir }}" + state: directory + +- name: Set templatized InfluxDB configuration + become: true + template: + src: influxdb.conf.j2 + dest: "{{ influxdb_configuration_dir }}/influxdb.conf" + force: yes + backup: yes + owner: influxdb + group: influxdb + mode: 0744 + register: influx_config + notify: restart influxdb + + diff --git a/playbooks/roles/influxdb/tasks/el.yml b/playbooks/roles/influxdb/tasks/el.yml index edc56d0e..d8e45e5b 100755 --- a/playbooks/roles/influxdb/tasks/el.yml +++ b/playbooks/roles/influxdb/tasks/el.yml @@ -1,70 +1,7 @@ --- -- name: Create /etc/opt/oci-hpc/passwords/influxdb - become: true - file: - path: /etc/opt/oci-hpc/passwords/influxdb - state: directory - owner: '{{ ansible_user }}' - mode: 0770 - group: '{{ ansible_user }}' - recurse: yes - -- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords - set_fact: - tmp_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Get influx password from /etc/opt/oci-hpc/passwords - set_fact: - influx_admin_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Add influxdb repository - become: true - yum_repository: - name: influxdb - description: InfluxDB Repository - RHEL $releasever - baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable - enabled: 1 - gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdb.key - -- name: Install InfluxDB - vars: - package_name: - - influxdb - - python-pip - package_state: latest - include_role: - name: safe_yum - -- name: Start InfluxDB - become: true - service: - name: influxdb - state: started - enabled: true - - -- name: Set configuration directory path - become: true - file: - path: "{{ influxdb_configuration_dir }}" - state: directory - -- name: Set templatized InfluxDB configuration - become: true - template: - src: influxdb.conf.j2 - dest: "{{ influxdb_configuration_dir }}/influxdb.conf" - force: yes - backup: yes - owner: influxdb - group: influxdb - mode: 0744 - register: influx_config - notify: restart influxdb - +- name: install influxdb + include_tasks: el_install_influxdb.yml +- name: configure influxdb on bastion + include_tasks: config_influxdb.yml + when: "'bastion' in group_names" \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/el_install_influxdb.yml b/playbooks/roles/influxdb/tasks/el_install_influxdb.yml new file mode 100644 index 00000000..1f3c0185 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/el_install_influxdb.yml @@ -0,0 +1,24 @@ +--- +- name: Add influxdb repository + become: true + yum_repository: + name: influxdb + description: InfluxDB Repository - RHEL $releasever + baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable + enabled: 1 + gpgcheck: 1 + gpgkey: https://repos.influxdata.com/influxdata-archive_compat.key + +- name: Install InfluxDB + vars: + package_name: + - influxdb + package_state: latest + include_role: + name: safe_yum + +- name: install influx pip + become: true + pip: + name: influxdb + executable: pip3 \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu.yml b/playbooks/roles/influxdb/tasks/ubuntu.yml index 42919848..a4cf3be1 100644 --- a/playbooks/roles/influxdb/tasks/ubuntu.yml +++ 
b/playbooks/roles/influxdb/tasks/ubuntu.yml @@ -1,83 +1,7 @@ --- -- name: Create /etc/opt/oci-hpc/passwords/influxdb - become: true - file: - path: /etc/opt/oci-hpc/passwords/influxdb - state: directory - owner: '{{ ansible_user }}' - mode: 0770 - group: '{{ ansible_user }}' - recurse: yes - -- name: Generate password for Influx admin and save it to /etc/opt/oci-hpc/passwords - set_fact: - tmp_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -- name: Get influx password from /etc/opt/oci-hpc/passwords - set_fact: - influx_admin_pwd: "{{ lookup('password', - '/etc/opt/oci-hpc/passwords/influxdb/root.txt - chars=ascii_letters,digits,hexdigits') }}" - -# - name: Add influxdb repository -# become: true -# apt_repository: -# repo: "deb [arch=amd64 signed-by=https://repos.influxdata.com/influxdb.key] https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable" -# state: present - # name: influxdb - # description: InfluxDB Repository - Debian - # baseurl: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - # enabled: 1 - # gpgcheck: 1 - # gpgkey: https://repos.influxdata.com/influxdb.key - -- name: Add InfluxData's key - become: true - apt_key: - state: present - url: https://repos.influxdata.com/influxdb.key - -- name: Manage InfluxData APT repositories - become: true - apt_repository: - repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - state: present - -- name: Install InfluxDB - vars: - package_name: - - influxdb - package_state: latest - include_role: - name: safe_yum - -- name: Start InfluxDB - become: true - service: - name: influxdb - state: started - enabled: true - - -- name: Set configuration directory path - become: true - file: - path: "{{ influxdb_configuration_dir }}" - state: directory - -- name: Set templatized InfluxDB configuration - become: true - template: - src: influxdb.conf.j2 - dest: "{{ influxdb_configuration_dir }}/influxdb.conf" - force: yes - backup: yes - owner: influxdb - group: influxdb - mode: 0744 - register: influx_config - notify: restart influxdb - +- name: install influxdb + include_tasks: ubuntu_install_influxdb.yml +- name: configure influxdb on bastion + include_tasks: config_influxdb.yml + when: "'bastion' in group_names" \ No newline at end of file diff --git a/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml new file mode 100644 index 00000000..ef93e456 --- /dev/null +++ b/playbooks/roles/influxdb/tasks/ubuntu_install_influxdb.yml @@ -0,0 +1,28 @@ +--- +- name: Add InfluxData's key + become: true + apt_key: + state: present + url: https://repos.influxdata.com/influxdata-archive_compat.key + +- name: Manage InfluxData APT repositories + become: true + apt_repository: + repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable + state: present + +- name: Install InfluxDB + vars: + package_name: + - influxdb + package_state: latest + include_role: + name: safe_yum + +- name: install influx pip + become: true + vars: + ansible_python_interpreter: /usr/bin/python3 + pip: + name: influxdb + executable: pip3 \ No newline at end of file diff --git a/playbooks/roles/mpi-hostfiles/tasks/common.yml b/playbooks/roles/mpi-hostfiles/tasks/common.yml index ec1f8330..a713c88a 100644 --- 
a/playbooks/roles/mpi-hostfiles/tasks/common.yml +++ b/playbooks/roles/mpi-hostfiles/tasks/common.yml @@ -12,7 +12,6 @@ mode: '0644' owner: "{{ ansible_user }}" group: "{{ ansible_user }}" - tags: hostfile delegate_to: 127.0.0.1 run_once: true when: cluster_network|bool @@ -24,7 +23,6 @@ mode: '0644' owner: "{{ ansible_user }}" group: "{{ ansible_user }}" - tags: hostfile delegate_to: 127.0.0.1 run_once: true diff --git a/playbooks/roles/mysql/tasks/centos.yml b/playbooks/roles/mysql/tasks/centos.yml index 3b4720cb..710c4a81 100644 --- a/playbooks/roles/mysql/tasks/centos.yml +++ b/playbooks/roles/mysql/tasks/centos.yml @@ -29,7 +29,6 @@ package_name: '{{ mariadb_packages }}' include_role: name: safe_yum - tags: install-only - name: Update SELinux context for {{ mysql_db_path }} diff --git a/playbooks/roles/mysql/tasks/debian.yml b/playbooks/roles/mysql/tasks/debian.yml index 98eb655c..9d4b47be 100644 --- a/playbooks/roles/mysql/tasks/debian.yml +++ b/playbooks/roles/mysql/tasks/debian.yml @@ -34,7 +34,6 @@ package_name: '{{ deb_mariadb_packages }}' include_role: name: safe_yum - tags: install-only - name: Ensure {{ mysql_db_path }} exists become: true diff --git a/playbooks/roles/mysql/tasks/el.yml b/playbooks/roles/mysql/tasks/el.yml index ea71c748..d893abb8 100644 --- a/playbooks/roles/mysql/tasks/el.yml +++ b/playbooks/roles/mysql/tasks/el.yml @@ -36,7 +36,6 @@ package_repo: ol7_MySQL80 include_role: name: safe_yum - tags: install-only - name: Update SELinux context for {{ mysql_db_path }} become: true diff --git a/playbooks/roles/nfs-client/tasks/debian.yml b/playbooks/roles/nfs-client/tasks/debian.yml index 4d8ed999..6d6a84f7 100644 --- a/playbooks/roles/nfs-client/tasks/debian.yml +++ b/playbooks/roles/nfs-client/tasks/debian.yml @@ -3,7 +3,6 @@ ansible.builtin.package: name: '{{ deb_packages }}' state: present - tags: install-only - name: create share directory become: true diff --git a/playbooks/roles/nfs-client/tasks/el.yml b/playbooks/roles/nfs-client/tasks/el.yml index cdcd5936..944d9fc2 100755 --- a/playbooks/roles/nfs-client/tasks/el.yml +++ b/playbooks/roles/nfs-client/tasks/el.yml @@ -4,7 +4,6 @@ package_name: '{{ nfs_rpm_packages }}' include_role: name: safe_yum - tags: install-only - name: create share directory become: true diff --git a/playbooks/roles/nfs-client/tasks/ubuntu.yml b/playbooks/roles/nfs-client/tasks/ubuntu.yml index a2af7e1c..e512a800 100644 --- a/playbooks/roles/nfs-client/tasks/ubuntu.yml +++ b/playbooks/roles/nfs-client/tasks/ubuntu.yml @@ -3,7 +3,6 @@ ansible.builtin.package: name: "{{ nfs_deb_packages }}" state: present - tags: install-only - name: create share directory become: true diff --git a/playbooks/roles/nfs-server/tasks/el.yml b/playbooks/roles/nfs-server/tasks/el.yml index cdc409f6..70f5d39f 100755 --- a/playbooks/roles/nfs-server/tasks/el.yml +++ b/playbooks/roles/nfs-server/tasks/el.yml @@ -1,6 +1,12 @@ --- - name: Ensure NFS utilities are installed. 
- package: name=nfs-utils state=present + vars: + package_name: + - nfs-utils + package_state: present + include_role: + name: safe_yum + ignore_errors: true - name: Start NFS server service: diff --git a/playbooks/roles/no_instance_principal/defaults/main.yml b/playbooks/roles/no_instance_principal/defaults/main.yml new file mode 100755 index 00000000..e69de29b diff --git a/playbooks/roles/no_instance_principal/meta/main.yml b/playbooks/roles/no_instance_principal/meta/main.yml new file mode 100755 index 00000000..e69de29b diff --git a/playbooks/roles/no_instance_principal/tasks/common.yml b/playbooks/roles/no_instance_principal/tasks/common.yml new file mode 100755 index 00000000..cae6bc29 --- /dev/null +++ b/playbooks/roles/no_instance_principal/tasks/common.yml @@ -0,0 +1,25 @@ +--- +- name: create .oci directory + become: true + file: + path: /home/{{ ansible_user }}/.oci + state: directory + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + mode: 0775 + +- name: Generate config file + become: true + template: + src: 'config.j2' + dest: '/home/{{ ansible_user }}/.oci/config' + mode: 0600 + owner: "{{ ansible_user }}" + group: "{{ ansible_user }}" + +- name: delete --auth in create_cluster.sh + become: true + replace: + path: /opt/oci-hpc/bin/create_cluster.sh + regexp: '--auth instance_principal' + replace: '' \ No newline at end of file diff --git a/playbooks/roles/no_instance_principal/tasks/main.yml b/playbooks/roles/no_instance_principal/tasks/main.yml new file mode 100755 index 00000000..270202fc --- /dev/null +++ b/playbooks/roles/no_instance_principal/tasks/main.yml @@ -0,0 +1,3 @@ +- include: common.yml + + diff --git a/playbooks/roles/no_instance_principal/templates/config.j2 b/playbooks/roles/no_instance_principal/templates/config.j2 new file mode 100644 index 00000000..31a1d924 --- /dev/null +++ b/playbooks/roles/no_instance_principal/templates/config.j2 @@ -0,0 +1,6 @@ +[DEFAULT] +user={{ api_user_ocid }} +fingerprint={{ api_fingerprint }} +tenancy={{ tenancy_ocid}} +region={{ region }} +key_file=/opt/oci-hpc/autoscaling/credentials/key.pem \ No newline at end of file diff --git a/playbooks/roles/no_instance_principal/vars/main.yml b/playbooks/roles/no_instance_principal/vars/main.yml new file mode 100755 index 00000000..e69de29b diff --git a/playbooks/roles/nvidia-container/tasks/ubuntu.yml b/playbooks/roles/nvidia-container/tasks/ubuntu.yml index 9bbc1537..49fe6d8a 100644 --- a/playbooks/roles/nvidia-container/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-container/tasks/ubuntu.yml @@ -13,15 +13,15 @@ owner: root group: root + - name: install packages - apt: - name: libnvidia-container-tools{{ libnvidia_container_tools_package_version | ternary("="+libnvidia_container_tools_package_version, "") }} - state: "{{ libnvidia_container_tools_package_state }}" - update_cache: yes - register: result - until: result is not failed - retries: 5 - delay: 5 + vars: + package_name: + - libnvidia-container-tools{{ libnvidia_container_tools_package_version | ternary("="+libnvidia_container_tools_package_version, "") }} + package_state: "{{ libnvidia_container_tools_package_state }}" + package_cache: true + include_role: + name: safe_yum - name: Install nvidia-container-toolkit vars: diff --git a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml index 7b531689..41e0e56b 100644 --- a/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml +++ b/playbooks/roles/nvidia-enroot/tasks/oraclelinux-7.yml @@ -36,13 +36,22 @@ - 
name: execute enroot-check_*.run command: bash -c "/tmp/enroot-check_*.run --verify" + - name: + set_fact: + enroot_top_path_checked: "/etc/enroot/" + when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + + - name: + set_fact: + enroot_top_path_checked: "{{enroot_top_path}}" + when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: update ENROOT_RUNTIME_PATH lineinfile: dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_RUNTIME_PATH.*' - line: 'ENROOT_RUNTIME_PATH {{enroot_top_path}}/enroot_runtime/user-$(id -u)' + line: 'ENROOT_RUNTIME_PATH {{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)' backup: yes - name: update ENROOT_DATA_PATH @@ -50,7 +59,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_DATA_PATH.*' - line: 'ENROOT_DATA_PATH {{enroot_top_path}}/enroot_data/user-$(id -u)' + line: 'ENROOT_DATA_PATH {{enroot_top_path_checked}}/enroot_data/user-$(id -u)' backup: yes - name: update ENROOT_CACHE_PATH @@ -58,7 +67,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_CACHE_PATH.*' - line: 'ENROOT_CACHE_PATH {{enroot_top_path}}/enroot_cache' + line: 'ENROOT_CACHE_PATH {{enroot_top_path_checked}}/enroot_cache' backup: yes - name: update ENROOT_TEMP_PATH @@ -66,14 +75,30 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_TEMP_PATH.*' - line: 'ENROOT_TEMP_PATH {{enroot_top_path}}/enroot_tmp' + line: 'ENROOT_TEMP_PATH {{enroot_top_path_checked}}/enroot_tmp' + backup: yes + + - name: update ENROOT_SQUASH_OPTIONS + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_SQUASH_OPTIONS.*' + line: 'ENROOT_SQUASH_OPTIONS -b 262144' + backup: yes + + - name: update ENROOT_ROOTFS_WRITABLE + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_ROOTFS_WRITABLE.*' + line: 'ENROOT_ROOTFS_WRITABLE yes' backup: yes - - name: set permissions on {{enroot_top_path}} + - name: set permissions on {{enroot_top_path_checked}} become: true file: - path: "{{enroot_top_path}}" + path: "{{enroot_top_path_checked}}" state: directory owner: opc mode: 0777 @@ -81,9 +106,9 @@ recurse: no - - name: Make sure all {{enroot_top_path}} directories exist + - name: Make sure all {{enroot_top_path_checked}} directories exist file: - path: "{{enroot_top_path}}/{{item}}" + path: "{{enroot_top_path_checked}}/{{item}}" state: directory mode: '0775' owner: opc diff --git a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml index d8c9619a..cdbcaa00 100644 --- a/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml +++ b/playbooks/roles/nvidia-enroot/tasks/ubuntu.yml @@ -6,16 +6,13 @@ dpkg_arch: "{{ 'amd64' if ansible_architecture == 'x86_64' else ansible_architecture }}" - name: install required packages - apt: - deb: '{{ item }}' - with_items: - - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot_3.4.0-1_{{ dpkg_arch }}.deb" - - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot+caps_3.4.0-1_{{ dpkg_arch }}.deb" - register: result - until: result is not failed - retries: 5 - delay: 5 - + vars: + deb_name: + - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot_3.4.0-1_{{ dpkg_arch }}.deb" + - "https://github.com/NVIDIA/enroot/releases/download/v3.4.0/enroot+caps_3.4.0-1_{{ dpkg_arch }}.deb" + package_state: present + include_role: + name: safe_yum - name: set kernel.unprivileged_userns_clone using sysctl ansible.posix.sysctl: name: kernel.unprivileged_userns_clone @@ -35,13 +32,22 @@ - name: 
execute enroot-check_*.run command: bash -c "/tmp/enroot-check_*.run --verify" + - name: + set_fact: + enroot_top_path_checked: "/etc/enroot/" + when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + + - name: + set_fact: + enroot_top_path_checked: "{{enroot_top_path}}" + when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" - name: update ENROOT_RUNTIME_PATH lineinfile: dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_RUNTIME_PATH.*' - line: 'ENROOT_RUNTIME_PATH {{enroot_top_path}}/enroot_runtime/user-$(id -u)' + line: 'ENROOT_RUNTIME_PATH {{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)' backup: yes - name: update ENROOT_DATA_PATH @@ -49,7 +55,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_DATA_PATH.*' - line: 'ENROOT_DATA_PATH {{enroot_top_path}}/enroot_data/user-$(id -u)' + line: 'ENROOT_DATA_PATH {{enroot_top_path_checked}}/enroot_data/user-$(id -u)' backup: yes - name: update ENROOT_CACHE_PATH @@ -57,7 +63,7 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_CACHE_PATH.*' - line: 'ENROOT_CACHE_PATH {{enroot_top_path}}/enroot_cache' + line: 'ENROOT_CACHE_PATH {{enroot_top_path_checked}}/enroot_cache' backup: yes - name: update ENROOT_TEMP_PATH @@ -65,14 +71,30 @@ dest: /etc/enroot/enroot.conf state: present regexp: '^#ENROOT_TEMP_PATH.*' - line: 'ENROOT_TEMP_PATH {{enroot_top_path}}/enroot_tmp' + line: 'ENROOT_TEMP_PATH {{enroot_top_path_checked}}/enroot_tmp' + backup: yes + + - name: update ENROOT_SQUASH_OPTIONS + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_SQUASH_OPTIONS.*' + line: 'ENROOT_SQUASH_OPTIONS -b 262144' + backup: yes + + - name: update ENROOT_ROOTFS_WRITABLE + lineinfile: + dest: /etc/enroot/enroot.conf + state: present + regexp: '^#ENROOT_ROOTFS_WRITABLE.*' + line: 'ENROOT_ROOTFS_WRITABLE yes' backup: yes - - name: set permissions on {{enroot_top_path}} + - name: set permissions on {{enroot_top_path_checked}} become: true file: - path: "{{enroot_top_path}}" + path: "{{enroot_top_path_checked}}" state: directory owner: "{{ ansible_user }}" mode: 0777 @@ -80,9 +102,9 @@ recurse: no - - name: Make sure all {{enroot_top_path}} directories exist + - name: Make sure all {{enroot_top_path_checked}} directories exist file: - path: "{{enroot_top_path}}/{{item}}" + path: "{{enroot_top_path_checked}}/{{item}}" state: directory mode: '0775' owner: "{{ ansible_user }}" diff --git a/playbooks/roles/openldap/tasks/debian.yml b/playbooks/roles/openldap/tasks/debian.yml index 2c24d2e0..49215f6c 100644 --- a/playbooks/roles/openldap/tasks/debian.yml +++ b/playbooks/roles/openldap/tasks/debian.yml @@ -6,27 +6,21 @@ name: apparmor state: stopped failed_when: false - tags: - - configuration - - apparmor - name: Remove Apparmor service service: name: apparmor enabled: false failed_when: false - tags: - - configuration - - apparmor - name: Remove Apparmor package - apt: - name: apparmor - state: absent - purge: true - tags: - - configuration - - apparmor + vars: + package_name: + - apparmor + package_state: absent + package_purge: true + include_role: + name: safe_yum - name: Create /etc/opt/oci-hpc/passwords/openldap become: true @@ -51,10 +45,12 @@ chars=ascii_letters,digits,hexdigits') }}" - name: Install the openldap and required Packages for Ubuntu - apt: - name: "{{ openldap_packages }}" - state: present - update_cache: yes + vars: + package_name: "{{ openldap_packages }}" + package_state: present + package_cache: true + include_role: + name: safe_yum - name: Hash 
OpenLDAP root password command: slappasswd -h {SSHA} -s {{ openldap_root_pwd }} diff --git a/playbooks/roles/openldap/tasks/el-7.yml b/playbooks/roles/openldap/tasks/el-7.yml index e8b31246..3f55faac 100644 --- a/playbooks/roles/openldap/tasks/el-7.yml +++ b/playbooks/roles/openldap/tasks/el-7.yml @@ -18,7 +18,6 @@ package_state: present include_role: name: safe_yum - tags: install-only - name: Generate openldap root password set_fact: @@ -37,7 +36,6 @@ package_name: "{{openldap_packages}}" include_role: name: safe_yum - tags: install-only - block: - name: Selinux fcontext on files diff --git a/playbooks/roles/openldap/vars/debian_vars.yml b/playbooks/roles/openldap/vars/debian_vars.yml index 724309b7..bb2fc0a6 100644 --- a/playbooks/roles/openldap/vars/debian_vars.yml +++ b/playbooks/roles/openldap/vars/debian_vars.yml @@ -5,7 +5,6 @@ openldap_packages: - slapd - ldap-utils - openssl - - python3-pip - libsasl2-dev - libldap2-dev - libssl-dev diff --git a/playbooks/roles/packages/tasks/centos-7.yml b/playbooks/roles/packages/tasks/centos-7.yml index 248d372d..30a8dace 100644 --- a/playbooks/roles/packages/tasks/centos-7.yml +++ b/playbooks/roles/packages/tasks/centos-7.yml @@ -6,6 +6,7 @@ - python2-cryptography - pssh - pdsh + - python3-pip package_state: latest include_role: name: safe_yum \ No newline at end of file diff --git a/playbooks/roles/packages/tasks/debian.yml b/playbooks/roles/packages/tasks/debian.yml index bd8c4991..d3911656 100644 --- a/playbooks/roles/packages/tasks/debian.yml +++ b/playbooks/roles/packages/tasks/debian.yml @@ -8,6 +8,7 @@ - pssh - pdsh - jq + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/packages/tasks/el-7.yml b/playbooks/roles/packages/tasks/el-7.yml index d3bbd6e0..fb61f6a8 100755 --- a/playbooks/roles/packages/tasks/el-7.yml +++ b/playbooks/roles/packages/tasks/el-7.yml @@ -7,6 +7,7 @@ - python3-oci-cli - pssh - pdsh + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/packages/tasks/main.yml b/playbooks/roles/packages/tasks/main.yml index 5774423d..275cffe6 100755 --- a/playbooks/roles/packages/tasks/main.yml +++ b/playbooks/roles/packages/tasks/main.yml @@ -1,6 +1,9 @@ - include: ol-7.yml when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '7' +- include: ol-8.yml + when: ansible_os_family == 'RedHat' and ansible_distribution == 'OracleLinux' and ansible_distribution_major_version == '8' + - include: centos-7.yml when: ansible_os_family == 'RedHat' and ansible_distribution == 'CentOS' and ansible_distribution_major_version == '7' diff --git a/playbooks/roles/packages/tasks/ol-7.yml b/playbooks/roles/packages/tasks/ol-7.yml index cfd59817..f0d58a2a 100644 --- a/playbooks/roles/packages/tasks/ol-7.yml +++ b/playbooks/roles/packages/tasks/ol-7.yml @@ -7,6 +7,7 @@ - python36-oci-cli - pssh - pdsh + - python3-pip package_state: latest package_repo: "epel,ol7_developer_EPEL" include_role: diff --git a/playbooks/roles/packages/tasks/ol-8.yml b/playbooks/roles/packages/tasks/ol-8.yml new file mode 100644 index 00000000..61607e48 --- /dev/null +++ b/playbooks/roles/packages/tasks/ol-8.yml @@ -0,0 +1,15 @@ +--- +- name: Make sure python OpenSSL and parallel ssh is installed + vars: + package_name: + #- pyOpenSSL + #- python2-cryptography + - python36-oci-cli + - pssh + - pdsh + - python3-pip + package_state: latest + package_repo: "epel,ol8_developer_EPEL" + include_role: + name: safe_yum + 
ignore_errors: true diff --git a/playbooks/roles/packages/tasks/ubuntu.yml b/playbooks/roles/packages/tasks/ubuntu.yml index 26f1acbb..408e6075 100644 --- a/playbooks/roles/packages/tasks/ubuntu.yml +++ b/playbooks/roles/packages/tasks/ubuntu.yml @@ -10,6 +10,7 @@ - pdsh - python3-netaddr - jq + - python3-pip package_state: latest include_role: name: safe_yum diff --git a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py index 6f2446ac..f874595f 100644 --- a/playbooks/roles/rack-aware/files/node_ordering_by_rack.py +++ b/playbooks/roles/rack-aware/files/node_ordering_by_rack.py @@ -1,11 +1,8 @@ #!/usr/bin/env python3 -from pssh.clients import ParallelSSHClient import json -import sys, getopt import os import argparse -from operator import itemgetter -from collections import OrderedDict +import subprocess def write_ordered_hostfile(ordered_hosts=[],hostfile=None): #ordered_hostfile="ordered_hostfile" @@ -43,28 +40,47 @@ def write_ordered_rankfile(ordered_hosts=[],hostfile=None): #with open('/etc/opt/oci-hpc/hostfile.tcp', 'r') as f: hosts = f.read().splitlines() -client = ParallelSSHClient(hosts) -output = client.run_command('curl http://169.254.169.254/opc/v1/host/') -#print(output) r = {} -for host_out in output: - j = json.loads(bytearray(''.join(list(host_out.stdout)).encode())) - #print(j) - if j['rackId'] in r: - r[j['rackId']].append( host_out.host ) - else: - r[j['rackId']] = [ host_out.host ] - - friendly_name_to_system_hostname = {} -hostname_output = client.run_command('/usr/bin/hostname') -#print(hostname_output) -for host_out in hostname_output: - #j = bytearray(''.join(list(host_out.stdout)).encode()) - j = bytearray(''.join(list(host_out.stdout)).encode()) - friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii') - #print(j.decode(encoding='ascii')+" "+host_out.host) +try: + from pssh.clients import ParallelSSHClient + client = ParallelSSHClient(hosts) + output = client.run_command('curl http://169.254.169.254/opc/v1/host/') + #print(output) + for host_out in output: + j = json.loads(bytearray(''.join(list(host_out.stdout)).encode())) + #print(j) + if j['rackId'] in r: + r[j['rackId']].append( host_out.host ) + else: + r[j['rackId']] = [ host_out.host ] + hostname_output = client.run_command('/usr/bin/hostname') + #print(hostname_output) + for host_out in hostname_output: + #j = bytearray(''.join(list(host_out.stdout)).encode()) + j = bytearray(''.join(list(host_out.stdout)).encode()) + friendly_name_to_system_hostname[host_out.host] = j.decode(encoding='ascii') + #print(j.decode(encoding='ascii')+" "+host_out.host) +except ImportError: + try: + for h in hosts: + out = subprocess.run(["ssh "+h+" \"curl -s http://169.254.169.254/opc/v1/host/\""],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True) + x = out.stdout.splitlines() + del x[-1] + del x[0] + rackId_str = x[1].split(":")[1].replace('"','') + rackId = rackId_str.replace(' ','') + if rackId in r: + r[rackId].append( h ) + else: + r[rackId] = [ h ] + for h in hosts: + out = subprocess.run(["ssh "+h+" /usr/bin/hostname"],stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True, universal_newlines=True, check=True) + x = out.stdout.splitlines() + friendly_name_to_system_hostname[h] = x[0] + except subprocess.CalledProcessError as e_process_error: + exit(f"Error code: {e_process_error.returncode} Output: {e_process_error.output}") ordered_hosts = [] diff --git 
a/playbooks/roles/rack-aware/tasks/el.yml b/playbooks/roles/rack-aware/tasks/el.yml index 1e68e989..56bab6c8 100644 --- a/playbooks/roles/rack-aware/tasks/el.yml +++ b/playbooks/roles/rack-aware/tasks/el.yml @@ -22,9 +22,10 @@ owner: "{{ ansible_user }}" group: "{{ privilege_group_name }}" -- name: "Safe Yum install of latest {{package_name}}" +- name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ @@ -33,6 +34,7 @@ mode: '0755' rescue: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ diff --git a/playbooks/roles/rack-aware/tasks/ubuntu.yml b/playbooks/roles/rack-aware/tasks/ubuntu.yml index 341a2ed5..1d0cc93f 100644 --- a/playbooks/roles/rack-aware/tasks/ubuntu.yml +++ b/playbooks/roles/rack-aware/tasks/ubuntu.yml @@ -1,21 +1,11 @@ -- name: Install Pip3 - vars: - package_name: - - python3-pip - package_state: latest - include_role: - name: safe_yum - ignore_errors: true - - name: install pssh and parallel-ssh become: true + vars: + ansible_python_interpreter: /usr/bin/python3 pip: name: ['pssh', 'parallel-ssh'] executable: pip3 state: latest - with_items: - - pssh - - parallel-ssh ignore_errors: yes - name: Make sure /opt/oci-hpc/bin/ exists @@ -28,9 +18,10 @@ owner: "{{ ansible_user }}" group: "{{ privilege_group_name }}" -- name: "Safe Yum install of latest {{package_name}}" +- name: Copy node_ordering_by_rack.py block: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ @@ -39,6 +30,7 @@ mode: '0755' rescue: - name: copy node_ordering_by_rack.py + become: true copy: src: node_ordering_by_rack.py dest: /opt/oci-hpc/bin/ diff --git a/playbooks/roles/rdma-interface/tasks/debian.yml b/playbooks/roles/rdma-interface/tasks/debian.yml index 069b7de2..42d28db8 100644 --- a/playbooks/roles/rdma-interface/tasks/debian.yml +++ b/playbooks/roles/rdma-interface/tasks/debian.yml @@ -18,13 +18,20 @@ return_content: yes register: i_metadata +- name: Change CIDR range for RDMA + become: true + replace: + path: /etc/oci-hpc/rdma-network.conf + regexp: 'rdma_network=192.168.0.0/255.255.0.0' + replace: 'rdma_network={{rdma_network}}/{{rdma_netmask}}' + - name: Append subnet part to /etc/oci-hpc/rdma-network.conf blockinfile: path: /etc/oci-hpc/rdma-network.conf block: | [subnet] modify_arp=true - override_netconfig_netmask=255.255.0.0 + override_netconfig_netmask={{rdma_netmask}} when: new_image.stat.exists - name: Start the OCI RDMA service diff --git a/playbooks/roles/rdma-interface/tasks/el.yml b/playbooks/roles/rdma-interface/tasks/el.yml index 2d5ee6e6..e37e2ce4 100755 --- a/playbooks/roles/rdma-interface/tasks/el.yml +++ b/playbooks/roles/rdma-interface/tasks/el.yml @@ -22,13 +22,20 @@ path: /sbin/oci-rdma-configure register: new_image +- name: Change CIDR range for RDMA + become: true + replace: + path: /etc/oci-hpc/rdma-network.conf + regexp: 'rdma_network=192.168.0.0/255.255.0.0' + replace: 'rdma_network={{rdma_network}}/{{rdma_netmask}}' + - name: Append subnet part to /etc/oci-hpc/rdma-network.conf blockinfile: path: /etc/oci-hpc/rdma-network.conf block: | [subnet] modify_arp=true - override_netconfig_netmask=255.255.0.0 + override_netconfig_netmask={{rdma_netmask}} when: new_image.stat.exists - name: Start the OCI RDMA service diff --git a/playbooks/roles/safe_yum/tasks/ubuntu.yml b/playbooks/roles/safe_yum/tasks/ubuntu.yml index 63a7cb80..6ad15a88 100755 --- 
a/playbooks/roles/safe_yum/tasks/ubuntu.yml +++ b/playbooks/roles/safe_yum/tasks/ubuntu.yml @@ -7,15 +7,33 @@ delay: 10 until: result.stdout | int == 0 -- name: "Installing {{package_name}}" +- name: "Installing/Removing {{package_name}}" become: true apt: name: "{{package_name}}" state: "{{package_state | default('latest')}}" + purge: "{{package_purge | default('false')}}" + update_cache: "{{package_cache | default('false')}}" register: result until: result is not failed retries: 5 delay: 5 + when: not deb_name is defined + +- name: "Installing/Removing {{package_name}}" + become: true + apt: + deb: "{{item}}" + state: "{{package_state | default('latest')}}" + purge: "{{package_purge | default('false')}}" + update_cache: "{{package_cache | default('false')}}" + register: result + until: result is not failed + retries: 5 + delay: 5 + when: deb_name is defined + with_items: "{{deb_name}}" + - name: Ensure apt process is completed become: true diff --git a/playbooks/roles/slurm/files/cgroup.conf b/playbooks/roles/slurm/files/cgroup.conf index 804efb72..57b5c5a2 100755 --- a/playbooks/roles/slurm/files/cgroup.conf +++ b/playbooks/roles/slurm/files/cgroup.conf @@ -1 +1,4 @@ -CgroupAutomount=yes \ No newline at end of file +CgroupMountpoint="/sys/fs/cgroup" +CgroupAutomount=yes +ConstrainDevices=yes +ConstrainCores=yes \ No newline at end of file diff --git a/playbooks/roles/slurm/files/sshd b/playbooks/roles/slurm/files/sshd new file mode 100644 index 00000000..186a3bf2 --- /dev/null +++ b/playbooks/roles/slurm/files/sshd @@ -0,0 +1,20 @@ +#%PAM-1.0 +auth required pam_nologin.so +auth include password-auth +# Used with polkit to reauthorize users in remote sessions +-auth optional pam_reauthorize.so prepare +account required pam_nologin.so +account include password-auth +password include password-auth +-account required pam_slurm_adopt.so +# pam_selinux.so close should be the first session rule +session required pam_selinux.so close +session required pam_loginuid.so +# pam_selinux.so open should only be followed by sessions to be executed in the user context +session required pam_selinux.so open env_params +session required pam_namespace.so +session optional pam_keyinit.so force revoke +session include password-auth +session include postlogin +# Used with polkit to reauthorize users in remote sessions +-session optional pam_reauthorize.so prepare diff --git a/playbooks/roles/slurm/tasks/common.yml b/playbooks/roles/slurm/tasks/common.yml index 4519a958..24287a59 100755 --- a/playbooks/roles/slurm/tasks/common.yml +++ b/playbooks/roles/slurm/tasks/common.yml @@ -42,17 +42,43 @@ name: safe_yum - name: Create Slurm RPM directory + become: true + file: + path: "{{ download_path }}/slurm_rpms" + state: directory + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' + when: download_path == '/tmp' + +- name: Create Slurm RPM directory + become: true file: path: "{{ download_path }}/slurm_rpms" state: directory + owner: '{{ ansible_user }}' + group: '{{ ansible_user }}' delegate_to: 127.0.0.1 run_once: true + when: download_path != '/tmp' - name: Download slurm .deb get_url: url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_amd64.deb" dest: "{{ download_path }}/slurm_rpms" - when: ansible_os_family == 'Debian' + when: ansible_os_family == 'Debian' and download_path == '/tmp' + +- name: Download slurm .rpm + get_url: + url: 
"https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" + dest: "{{ download_path }}/slurm_rpms" + with_items: "{{slurm_all_packages}}" + when: ansible_os_family == 'RedHat' and download_path == '/tmp' + +- name: Download slurm .deb + get_url: + url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/slurm-{{slurm_version}}_amd64.deb" + dest: "{{ download_path }}/slurm_rpms" + when: ansible_os_family == 'Debian' and download_path != '/tmp' delegate_to: 127.0.0.1 run_once: true @@ -61,15 +87,17 @@ url: "https://objectstorage.eu-frankfurt-1.oraclecloud.com/p/VnkLhYXOSNVilVa9d24Riz1fz4Ul-KTXeK4HCKoyqv0ghW3gry3Xz8CZqloqphLw/n/hpc/b/source/o/slurm/{{ item }}" dest: "{{ download_path }}/slurm_rpms" with_items: "{{slurm_all_packages}}" - when: ansible_os_family == 'RedHat' + when: ansible_os_family == 'RedHat' and download_path != '/tmp' delegate_to: 127.0.0.1 - run_once: true + run_once: true - name: Install .deb - become: true - apt: - deb: "{{ download_path }}/slurm_rpms/slurm-22.05.4-1_amd64.deb" - state: present + vars: + deb_name: + - "{{ download_path }}/slurm_rpms/slurm-22.05.4-1_amd64.deb" + package_state: present + include_role: + name: safe_yum when: ansible_os_family == 'Debian' - name: install SLURM common packages RedHat diff --git a/playbooks/roles/slurm/tasks/common_pyxis.yml b/playbooks/roles/slurm/tasks/common_pyxis.yml index 3d17f750..596b1286 100644 --- a/playbooks/roles/slurm/tasks/common_pyxis.yml +++ b/playbooks/roles/slurm/tasks/common_pyxis.yml @@ -1,4 +1,15 @@ --- + +- name: + set_fact: + enroot_top_path_checked: "/etc/enroot/" + when: " not 'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + +- name: + set_fact: + enroot_top_path_checked: "{{enroot_top_path}}" + when: "'nvme0n1' in hostvars[inventory_hostname].ansible_devices" + - name: copy files become: true become_method: sudo diff --git a/playbooks/roles/slurm/tasks/compute-rack-aware.yml b/playbooks/roles/slurm/tasks/compute-rack-aware.yml index 5bc86dd1..6da70b8f 100755 --- a/playbooks/roles/slurm/tasks/compute-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/compute-rack-aware.yml @@ -1,10 +1,11 @@ --- + - name: install SLURM compute packages - vars: + vars: package_name: '{{ slurm_compute_packages }}' package_repo: "{{ slurm_repos }}" disable_gpg_check_var: True - include_role: + include_role: name: safe_yum - name: Render systemd units for slurm, slurmdbd and munge @@ -82,9 +83,12 @@ shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId' # shell: echo $RANDOM | md5sum | head -c 20 register: rackID_script + retries: 5 + delay: 5 + until: rackID_script is not failed - name: Set RackID fact - set_fact: + set_fact: rackID: "{{ rackID_script.stdout[1:-1]}}" - name: Get nodes from Inactive Switch @@ -98,7 +102,7 @@ - name: Get rackIDs for all compute nodes set_fact: racks_to_add_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" run_once: true register: racks_to_add_temp_results @@ -109,7 +113,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | 
difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" run_once: true register: nodes_to_add_temp_results @@ -136,7 +140,7 @@ - name: Get hostlist if switch exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ item.stdout_lines | union (new_line[:-1].split(',') | list )| join(',') }}" register: rack_hostlist1 delegate_to: 127.0.0.1 @@ -146,7 +150,7 @@ - name: Get hostlist if switch does not exists vars: - new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" + new_line: "{% for node in ( play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) ) %}{% if cluster_name+':'+hostvars[node]['rackID'] == item.item.item %}{{hostvars[node]['ansible_hostname']}},{% endif %}{% endfor %}" command: "scontrol show hostlistsorted {{ new_line[:-1] }}" register: rack_hostlist2 delegate_to: 127.0.0.1 @@ -169,7 +173,7 @@ run_once: true delegate_to: 127.0.0.1 when: item.item.item.rc > 0 - + - name: Add the nodes in the rack switches become: true lineinfile: @@ -234,6 +238,13 @@ delegate_to: 127.0.0.1 when: racks_left_list | length > 0 +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + +- name: run handlers + meta: flush_handlers + - name: start slurmd become: true service: @@ -241,23 +252,41 @@ state: restarted enabled: true -- name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - until: node_state.stdout.find("failure") == -1 - retries: 10 - delay: 5 - -- set_fact: - node_state2={{ node_state.stdout }} - name: Update node state on bastion - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - register: result - retries: 10 - delay: 5 - until: result is not failed - delegate_to: 127.0.0.1 \ No newline at end of file + block: + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + - set_fact: + node_state2: "{{ node_state.stdout }}" + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + rescue: + - name: Sleep 5 seconds + pause: + seconds: 10 + + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + until: node_state.stdout.find("failure") == -1 + retries: 10 + delay: 5 + + - set_fact: + node_state2: "{{ node_state.stdout }}" + + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: 
node_state2 != "idle" and node_state2 != "alloc" + register: result + retries: 10 + delay: 5 + until: result is not failed + delegate_to: 127.0.0.1 \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/compute.yml b/playbooks/roles/slurm/tasks/compute.yml index 943886cc..a994c174 100755 --- a/playbooks/roles/slurm/tasks/compute.yml +++ b/playbooks/roles/slurm/tasks/compute.yml @@ -1,4 +1,9 @@ --- + +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + - name: install SLURM compute packages vars: package_name: '{{ slurm_compute_packages }}' @@ -82,7 +87,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) }}" run_once: true register: nodes_to_add_temp_results @@ -157,6 +162,10 @@ delegate_to: 127.0.0.1 notify: reconfigure slurm +- name: Run Pam settings + include: compute_pam.yml + when: pam|bool + - name: start slurmd become: true service: @@ -164,23 +173,40 @@ state: restarted enabled: true -- name: Grab Node State - shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' - register: node_state - delegate_to: 127.0.0.1 - until: node_state.stdout.find("failure") == -1 - retries: 10 - delay: 5 - -- set_fact: - node_state2={{ node_state.stdout }} - - name: Update node state on bastion - become: true - command: scontrol update nodename={{ ansible_hostname }} state=RESUME - when: node_state2 != "idle" and node_state2 != "alloc" - register: result - retries: 10 - delay: 5 - until: result is not failed - delegate_to: 127.0.0.1 \ No newline at end of file + block: + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + - set_fact: + node_state2: "{{ node_state.stdout }}" + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + rescue: + - name: Sleep 5 seconds + pause: + seconds: 10 + + - name: Grab Node State + shell: 'sinfo -h -o "%t" -n {{ ansible_hostname }}' + register: node_state + delegate_to: 127.0.0.1 + until: node_state.stdout.find("failure") == -1 + retries: 10 + delay: 5 + + - set_fact: + node_state2: "{{ node_state.stdout }}" + + - name: Update node state on bastion + become: true + command: scontrol update nodename={{ ansible_hostname }} state=RESUME + when: node_state2 != "idle" and node_state2 != "alloc" + register: result + retries: 10 + delay: 5 + until: result is not failed + delegate_to: 127.0.0.1 \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/compute_pam.yml b/playbooks/roles/slurm/tasks/compute_pam.yml new file mode 100644 index 00000000..0e4a29ff --- /dev/null +++ b/playbooks/roles/slurm/tasks/compute_pam.yml @@ -0,0 +1,24 @@ +--- +- name: Edit /etc/security/access.conf + become: true + blockinfile: + dest: /etc/security/access.conf + block: | + +:root:ALL + +:wheel:ALL + +:opc:ALL + -:ALL:ALL + +- name: Copy sshd file + become: true + copy: + src: sshd + dest: /etc/pam.d/sshd + +- name: Stop logind + become: true + systemd: + name: systemd-logind + state: stopped + enabled: no + masked: yes diff --git a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml index 68dc62a3..fb4604d3 100755 --- 
a/playbooks/roles/slurm/tasks/destroy-rack-aware.yml +++ b/playbooks/roles/slurm/tasks/destroy-rack-aware.yml @@ -49,7 +49,7 @@ - name: Get hostnames set_fact: nodes_to_remove_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" run_once: true register: nodes_to_remove_temp_results @@ -57,6 +57,24 @@ set_fact: nodes_to_remove="{{nodes_to_remove_temp_results.results | map(attribute='ansible_facts.nodes_to_remove_temp') | list}}" run_once: true +- name: Get new inactive_nodes list + command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}" + register: new_inactive_list + run_once: true + delegate_to: 127.0.0.1 + +- name: Adding nodes to inactive + vars: + - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" + become: true + lineinfile: + path: "{{ slurm_conf_path }}/topology.conf" + regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" + line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}" + state: present + run_once: true + delegate_to: 127.0.0.1 + - name: Run the script to get the RackID shell: 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v1/host | jq .rackId' # shell: echo $RANDOM | md5sum | head -c 20 @@ -69,7 +87,7 @@ - name: Get rackIDs set_fact: racks_to_remove_temp: "{{cluster_name}}:{{hostvars[item]['rackID']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login'])}}" run_once: true register: racks_to_remove_temp_results @@ -160,26 +178,9 @@ delegate_to: 127.0.0.1 when: racks_left_list | list | length > 0 -- name: Get new inactive_nodes list - command: "scontrol show hostlistsorted {{inactive_list | union(nodes_to_remove) | join(',')}}" - register: new_inactive_list - run_once: true - delegate_to: 127.0.0.1 - -- name: Adding nodes to inactive - vars: - - keyword: "{% for partition in queues %}{% for instance in partition.instance_types %}{% if instance.name == instance_type %}{{instance.instance_keyword}}{% endif %}{% endfor %}{% endfor %}" - become: true - lineinfile: - path: "{{ slurm_conf_path }}/topology.conf" - regexp: "SwitchName=inactive-{{queue}}-{{keyword}}\\sNodes.*" - line: "SwitchName=inactive-{{queue}}-{{keyword}} Nodes={{new_inactive_list.stdout }}" - state: present - run_once: true - delegate_to: 127.0.0.1 - - name: Reconfigure Slurm for topology become: true command: "scontrol reconfigure" delegate_to: 127.0.0.1 - run_once: true \ No newline at end of file + run_once: true + ignore_errors: yes \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/destroy.yml b/playbooks/roles/slurm/tasks/destroy.yml index c030419c..406b3b5b 100755 --- a/playbooks/roles/slurm/tasks/destroy.yml +++ b/playbooks/roles/slurm/tasks/destroy.yml @@ -55,7 +55,7 @@ - name: Get hostnames set_fact: nodes_to_add_temp: "{{hostvars[item]['ansible_hostname']}}" - with_items: "{{ play_hosts | difference(groups['bastion']) }}" + with_items: "{{ play_hosts | difference(groups['bastion']) | difference(groups['slurm_backup']) | difference(groups['login']) 
}}" run_once: true register: nodes_to_add_temp_results @@ -120,4 +120,5 @@ become: true command: "scontrol reconfigure" delegate_to: 127.0.0.1 - run_once: true \ No newline at end of file + run_once: true + ignore_errors: true \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/el7.yml b/playbooks/roles/slurm/tasks/el7.yml index ad8ecca8..b2be275f 100755 --- a/playbooks/roles/slurm/tasks/el7.yml +++ b/playbooks/roles/slurm/tasks/el7.yml @@ -19,18 +19,24 @@ include_tasks: server.yml when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) -- name: run backup server directives - vars: - slurm_repos: "epel,ol7_developer_EPEL" - include_tasks: backup_server.yml - when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives vars: slurm_repos: "epel,ol7_developer_EPEL" include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) +- name: run login server directives + vars: + slurm_repos: "epel,ol7_developer_EPEL" + include_tasks: login.yml + when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + +- name: run backup server directives + vars: + slurm_repos: "epel,ol7_developer_EPEL" + include_tasks: backup_server.yml + when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) @@ -38,3 +44,7 @@ - name: destroy include_tasks: destroy{{rack_aware_playbook_suffix}}.yml when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) + +- name: move topology.conf on backup slurm controller + include_tasks: move-topology.yml + when: ('slurm_backup' in group_names) and (not initial| bool) diff --git a/playbooks/roles/slurm/tasks/el8.yml b/playbooks/roles/slurm/tasks/el8.yml index 76bf75dd..d4b2cbbb 100755 --- a/playbooks/roles/slurm/tasks/el8.yml +++ b/playbooks/roles/slurm/tasks/el8.yml @@ -19,18 +19,24 @@ include_tasks: server.yml when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) -- name: run backup server directives - vars: - slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" - include_tasks: backup_server.yml - when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives vars: slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: ('compute' in group_names) and (not destroy|bool) +- name: run login server directives + vars: + slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" + include_tasks: login.yml + when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + +- name: run backup server directives + vars: + slurm_repos: "ol8_developer_EPEL,ol8_codeready_builder" + include_tasks: backup_server.yml + when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) @@ -38,3 +44,7 @@ - name: destroy include_tasks: destroy{{rack_aware_playbook_suffix}}.yml when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) + +- name: move topology.conf on backup slurm controller + include_tasks: move-topology.yml + when: ('slurm_backup' in group_names) and (not initial| bool) diff --git a/playbooks/roles/slurm/tasks/login.yml b/playbooks/roles/slurm/tasks/login.yml new file mode 100755 
index 00000000..48998e34 --- /dev/null +++ b/playbooks/roles/slurm/tasks/login.yml @@ -0,0 +1,86 @@ +--- +- name: install SLURM login packages + vars: + package_name: '{{ slurm_login_packages }}' + package_repo: "{{ slurm_repos }}" + disable_gpg_check_var: True + include_role: + name: safe_yum + +- name: Render systemd units for slurmd + become: true + template: + src: 'systemd/{{ item }}.service' + dest: '/lib/systemd/system/{{ item }}.service' + backup: "yes" + with_items: + - slurmd + when: ansible_os_family == 'Debian' + +- name: Create systemd unit dirs for slurmd and munge + become: true + file: + name: '/etc/systemd/system/{{ item }}.service.d' + state: directory + with_items: + - munge + - slurmd + +- name: Render systemd units for slurmd and munge + become: true + template: + src: 'systemd/{{ item }}.service.d/unit.conf.j2' + dest: '/etc/systemd/system/{{ item }}.service.d/unit.conf' + backup: "yes" + with_items: + - munge + - slurmd + +- name: Create munge dir + become: true + file: + name: '{{ munge_conf_path }}' + state: directory + owner: munge + group: munge + mode: 0700 + +- name: copy munge.key to tmp + become: true + shell: + cmd: cp /etc/munge/munge.key /tmp/munge.key + warn: false + delegate_to: 127.0.0.1 + run_once: true + +- name: set permissions + become: true + shell: + cmd: chown {{ ansible_user }}:{{ ansible_user }} /tmp/munge.key + warn: false + delegate_to: 127.0.0.1 + run_once: true + +- name: Copy munge key + become: true + copy: + src: /tmp/munge.key + dest: /etc/munge/munge.key + owner: munge + group: munge + mode: '0400' + notify: restart munge + +- name: restart munge + become: true + service: + name: munge + state: restarted + enabled: true + +- name: start slurmd + become: true + service: + name: slurmd + state: restarted + enabled: true \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/move-topology.yml b/playbooks/roles/slurm/tasks/move-topology.yml new file mode 100644 index 00000000..8e09347d --- /dev/null +++ b/playbooks/roles/slurm/tasks/move-topology.yml @@ -0,0 +1,12 @@ +--- + +- name: move topology.conf on backup servers + become: true + copy: + dest: '{{ slurm_conf_path }}/topology.conf' + src: '{{ slurm_conf_path }}/topology.conf' + force: yes + register: topology_copied + until: topology_copied is not failed + retries: 10 + delay: 5 \ No newline at end of file diff --git a/playbooks/roles/slurm/tasks/server.yml b/playbooks/roles/slurm/tasks/server.yml index 2c751f6c..33678c6a 100755 --- a/playbooks/roles/slurm/tasks/server.yml +++ b/playbooks/roles/slurm/tasks/server.yml @@ -166,6 +166,12 @@ line: alias max_nodes="python3 /opt/oci-hpc/bin/max_nodes_partition.py" state: present +- name: add alias for validation of number of nodes, pcie, and gpu throttle check + lineinfile: + path: '/home/{{ ansible_user }}/.bashrc' + line: alias validate="python3 /opt/oci-hpc/bin/validation.py" + state: present + - name: Generate gres.conf become: true template: diff --git a/playbooks/roles/slurm/tasks/ubuntu.yml b/playbooks/roles/slurm/tasks/ubuntu.yml index 57ccbbf0..96a8843e 100644 --- a/playbooks/roles/slurm/tasks/ubuntu.yml +++ b/playbooks/roles/slurm/tasks/ubuntu.yml @@ -6,14 +6,18 @@ include_tasks: server.yml when: ('bastion' in group_names) and (not destroy|bool) and (initial| bool) -- name: run backup server directives - include_tasks: backup_server.yml - when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) - - name: run compute directives include_tasks: "compute{{rack_aware_playbook_suffix}}.yml" when: 
('compute' in group_names) and (not destroy|bool) +- name: run login server directives + include_tasks: login.yml + when: ('login' in group_names) and (not destroy|bool) and (initial| bool) + +- name: run backup server directives + include_tasks: backup_server.yml + when: ('slurm_backup' in group_names) and (not destroy|bool) and (initial| bool) + - name: cleanup include_tasks: cleanup.yml when: ('compute' in group_names) and (not destroy|bool) @@ -21,3 +25,7 @@ - name: destroy include_tasks: destroy{{rack_aware_playbook_suffix}}.yml when: ('compute' in group_names or 'compute_to_destroy' in group_names) and (destroy|bool) + +- name: move topology.conf on backup slurm controller + include_tasks: move-topology.yml + when: ('slurm_backup' in group_names) and (not initial| bool) diff --git a/playbooks/roles/slurm/templates/gres.conf.j2 b/playbooks/roles/slurm/templates/gres.conf.j2 index a18b5bfb..f241cdd9 100644 --- a/playbooks/roles/slurm/templates/gres.conf.j2 +++ b/playbooks/roles/slurm/templates/gres.conf.j2 @@ -1,41 +1,44 @@ {% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%} {% for partition in queues %} {% for instance in partition.instance_types %} - {% if instance.shape == "BM.GPU2.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-13] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia1 Type=P100 Cores=[14-27] AutoDetect=nvml {% elif instance.shape == "VM.GPU2.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=P100 Cores=[0-11] AutoDetect=nvml {% elif instance.shape == "VM.GPU3.1"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=V100 Cores=[0-5] AutoDetect=nvml {% elif instance.shape == "VM.GPU3.2"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=V100 Cores=[0-11] AutoDetect=nvml {% elif instance.shape == "VM.GPU3.4"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-23] AutoDetect=nvml {% elif instance.shape == "BM.GPU3.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-3] Type=V100 Cores=[0-25] AutoDetect=nvml 
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-7] Type=V100 Cores=[26-51] AutoDetect=nvml {% elif instance.shape == "BM.GPU4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml {% elif instance.shape == "BM.GPU.B4.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml {% elif instance.shape == "BM.GPU.A100-v2.8"%} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A100 Cores=[48-63] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A100 Cores=[16-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[4-5] Type=A100 Cores=[112-127] AutoDetect=nvml 
+NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[6-7] Type=A100 Cores=[80-95] AutoDetect=nvml {% elif instance.shape == "BM.GPU.T1.2" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] AutoDetect=nvml {% elif instance.shape == "BM.GPU.A10.4" %} -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] -NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-31] AutoDetect=nvml +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[2-3] Type=A10 Cores=[32-63] AutoDetect=nvml +{% elif instance.shape == "VM.GPU.A10.2" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia[0-1] Type=A10 Cores=[0-29] AutoDetect=nvml +{% elif instance.shape == "VM.GPU.A10.1" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Name=gpu File=/dev/nvidia0 Type=A10 Cores=[0-14] AutoDetect=nvml {% endif %} {% endfor %} {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/prolog.sh.j2 b/playbooks/roles/slurm/templates/prolog.sh.j2 index 0c799685..25f23573 100644 --- a/playbooks/roles/slurm/templates/prolog.sh.j2 +++ b/playbooks/roles/slurm/templates/prolog.sh.j2 @@ -1,5 +1,5 @@ #!/bin/sh -runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{enroot_top_path}}/enroot_runtime/user-$(id -u)"')" +runtime_path="$(sudo -u "$SLURM_JOB_USER" sh -c 'echo "{{enroot_top_path_checked}}/enroot_runtime/user-$(id -u)"')" mkdir -p "$runtime_path" chown "$SLURM_JOB_USER:$(id -g "$SLURM_JOB_USER")" "$runtime_path" #chmod 777 -R /tmp diff --git a/playbooks/roles/slurm/templates/slurm.conf.j2 b/playbooks/roles/slurm/templates/slurm.conf.j2 index 23999c59..7ad0c47a 100755 --- a/playbooks/roles/slurm/templates/slurm.conf.j2 +++ b/playbooks/roles/slurm/templates/slurm.conf.j2 @@ -19,7 +19,8 @@ SlurmctldLogFile=/var/log/slurm/slurmctld.log SlurmdLogFile=/var/log/slurm/slurmd.log StateSaveLocation={{ slurm_nfs_path }}/spool/slurm SwitchType=switch/none -TaskPlugin=task/affinity +TaskPlugin=task/affinity,task/cgroup +PrologFlags=contain InactiveLimit=0 KillWait=30 MinJobAge=300 @@ -45,9 +46,21 @@ TopologyPlugin=topology/tree TreeWidth=2048 SlurmctldParameters=enable_configless -{% set size = hostvars[inventory_hostname]['private_subnet'] | ipaddr('size')%} +{% if sacct_limits|bool %} +AccountingStorageTRES=gres/gpu +AccountingStorageEnforce=limits,associations,qos,safe +JobCompType=jobcomp/none +TrackWckey=no +{% endif %} + + +{% if (groups['login']| length ) > 0 %} +NodeName={{ hostvars[groups['login'][0]]['ansible_fqdn'].split('.')[0] }} +{% endif %} + {% for partition in queues %} {% for instance in partition.instance_types %} +{% set size = instance.private_subnet | ipaddr('size')%} {% if instance.hyperthreading | bool %} {% set threadspercore = 2 %} {% else %} @@ -75,6 +88,10 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 
SocketsPerBoard=1 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 {% elif instance.shape == "BM.GPU.A10.4" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=32 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:4 +{% elif instance.shape == "VM.GPU.A10.2" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=30 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:2 +{% elif instance.shape == "VM.GPU.A10.1" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket=15 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} Gres=gpu:A10:1 {% elif instance.shape == "VM.Standard.E3.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard.E4.Flex" %} @@ -83,14 +100,20 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard3.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "VM.DenseIO.E4.Flex" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "VM.Standard.A1.Flex" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=1 CoresPerSocket={{instance.instance_pool_ocpus}} ThreadsPerCore=1 State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard.E3.128" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Standard.E4.128" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE 
Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.DenseIO.E4.128" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.HPC2.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} +{% elif instance.shape == "BM.HPC.E5.128" %} +NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=64 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif instance.shape == "BM.Optimized3.36" %} NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boards=1 SocketsPerBoard=2 CoresPerSocket=18 ThreadsPerCore={{threadspercore}} State=FUTURE Features={% if instance.shape != instance.name%}{{ instance.shape }},{% endif %}{{ instance.name }} {% elif "VM.Standard2." in instance.shape %} @@ -111,10 +134,18 @@ NodeName={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] Boar {% for partition in queues %} {% if partition.default %} -PartitionName={{partition.name}} Nodes={% for instance in partition.instance_types -%} - {{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}],{%- endfor %} Default=YES MaxTime=INFINITE State=UP +{% set nodesList = [] %} +{% for instance in partition.instance_types %} +{% set size = instance.private_subnet | ipaddr('size')%} +{{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}} +{%- endfor %} +PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=YES MaxTime=INFINITE State=UP {% else %} -PartitionName={{partition.name}} Nodes={% for instance in partition.instance_types -%} - {{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}],{%- endfor %} Default=NO MaxTime=INFINITE State=UP +{% set nodesList = [] %} +{% for instance in partition.instance_types %} +{% set size = instance.private_subnet | ipaddr('size')%} +{{ nodesList.append(partition.name+'-'+instance.instance_keyword+'-node-[1-'+size|string+']')}} +{%- endfor %} +PartitionName={{partition.name}} Nodes={{nodesList|join(',')}} Default=NO MaxTime=INFINITE State=UP {% endif %} {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/slurmdbd.conf.j2 b/playbooks/roles/slurm/templates/slurmdbd.conf.j2 index 66f49698..cfa7606f 100755 --- a/playbooks/roles/slurm/templates/slurmdbd.conf.j2 +++ b/playbooks/roles/slurm/templates/slurmdbd.conf.j2 @@ -23,3 +23,6 @@ StoragePass={{ slurmdbd_sql_pwd }} StorageUser={{ slurm_db_user }} StorageLoc={{ slurm_db_name }} +{% if sacct_limits|bool %} +TrackWckey=no +{% endif %} \ No newline at end of file diff --git a/playbooks/roles/slurm/templates/topology.conf.j2 b/playbooks/roles/slurm/templates/topology.conf.j2 index 12bb02d2..66654ab1 100644 --- a/playbooks/roles/slurm/templates/topology.conf.j2 +++ b/playbooks/roles/slurm/templates/topology.conf.j2 @@ -1,7 +1,10 @@ ### Topology File -{% set size = hostvars[inventory_hostname]['private_subnet'] | 
ipaddr('size')%} +{% if (groups['login']| length ) > 0 %} +SwitchName=login-node Nodes={{ hostvars[groups['login'][0]]['ansible_fqdn'].split('.')[0] }} +{% endif %} {% for partition in queues %} {% for instance in partition.instance_types %} +{% set size = instance.private_subnet | ipaddr('size')%} SwitchName=inactive-{{partition.name}}-{{instance.instance_keyword}} Nodes={{partition.name}}-{{instance.instance_keyword}}-node-[1-{{size}}] {% endfor %} {% endfor %} \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/centos_vars.yml b/playbooks/roles/slurm/vars/centos_vars.yml index d37fe9ad..7498933c 100644 --- a/playbooks/roles/slurm/vars/centos_vars.yml +++ b/playbooks/roles/slurm/vars/centos_vars.yml @@ -44,6 +44,11 @@ slurm_compute_packages: slurm_backup_server_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-slurmctld-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" + +slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-centos-pam_slurm-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-libpmi-{{slurm_version}}.el7.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-centos-slurmd-{{slurm_version}}.el7.x86_64.rpm" \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/el_vars.yml b/playbooks/roles/slurm/vars/el_vars.yml index a72d3681..cabe9ef0 100644 --- a/playbooks/roles/slurm/vars/el_vars.yml +++ b/playbooks/roles/slurm/vars/el_vars.yml @@ -43,6 +43,11 @@ slurm_compute_packages: slurm_backup_server_packages: - "{{ download_path }}/slurm_rpms/slurm-slurmctld-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" + +slurm_login_packages: - "{{ download_path }}/slurm_rpms/slurm-pam_slurm-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-libpmi-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" - "{{ download_path }}/slurm_rpms/slurm-slurmd-{{slurm_version}}.el{{ansible_distribution_major_version}}.x86_64.rpm" \ No newline at end of file diff --git a/playbooks/roles/slurm/vars/ubuntu_vars.yml b/playbooks/roles/slurm/vars/ubuntu_vars.yml index e313d4da..c820e9b8 100644 --- a/playbooks/roles/slurm/vars/ubuntu_vars.yml +++ b/playbooks/roles/slurm/vars/ubuntu_vars.yml @@ -20,4 +20,7 @@ slurm_compute_packages: - libpmi0 slurm_backup_server_packages: + - libpmi0 + +slurm_login_packages: - libpmi0 \ No newline at end of file diff --git a/playbooks/roles/spack/tasks/debian.yml b/playbooks/roles/spack/tasks/debian.yml index 45006236..841fafc7 100644 --- a/playbooks/roles/spack/tasks/debian.yml +++ b/playbooks/roles/spack/tasks/debian.yml @@ -1,24 +1,21 @@ --- - name: install GIT - apt: - name: git - state: latest - become: true + vars: + package_name: + - git + package_state: latest + include_role: + name: safe_yum when: cluster_nfs - register: result - until: result is not failed - retries: 5 - delay: 5 - name: 
Development Tools" - apt: - name: build-essential - become: true + vars: + package_name: + - build-essential + package_state: latest + include_role: + name: safe_yum when: cluster_nfs - register: result - until: result is not failed - retries: 5 - delay: 5 - name: Clone SPACK git: diff --git a/playbooks/roles/sssd/tasks/debian.yml b/playbooks/roles/sssd/tasks/debian.yml index b5f8ec9d..9e0c1d71 100644 --- a/playbooks/roles/sssd/tasks/debian.yml +++ b/playbooks/roles/sssd/tasks/debian.yml @@ -46,10 +46,12 @@ replace: 'password [success=1 user_unknown=ignore default=die] pam_ldap.so try_first_pass' - name: Install the openldap and required Packages for Ubuntu - apt: - name: "{{ openldap_packages }}" - state: present - update_cache: yes + vars: + package_name: "{{ openldap_packages }}" + package_state: present + package_cache: true + include_role: + name: safe_yum - name: Update sshd configuration lineinfile: diff --git a/playbooks/roles/sssd/tasks/el-8.yml b/playbooks/roles/sssd/tasks/el-8.yml new file mode 100644 index 00000000..ecfc4255 --- /dev/null +++ b/playbooks/roles/sssd/tasks/el-8.yml @@ -0,0 +1,48 @@ +--- +- name: Install sssd packages + vars: + package_name: + - sssd + - authconfig + include_role: + name: safe_yum + +- name: Add configuration file to /etc/sssd/sssd.conf + template: + src: 'sssd.conf.j2' + dest: '/etc/sssd/sssd.conf' + owner: 'root' + group: 'root' + mode: '0600' + notify: restart sssd + +- name: Copy CA certificate + copy: + src: "{{ ssl_ca_cert }}" + dest: /etc/openldap/certs/cluster-ca.crt + +- name: Adjust OpenLDAP client TLS configuration + lineinfile: + path: '/etc/openldap/ldap.conf' + line: 'TLS_CACERT /etc/openldap/certs/cluster-ca.crt' + +- name: Enable sssd service + systemd: + name: sssd + enabled: "yes" + +- name: Start sssd service + systemd: + name: sssd + state: started + +- name: Update sshd configuration + lineinfile: + path: /etc/ssh/sshd_config + regexp: '^PasswordAuthentication' + line: PasswordAuthentication no + notify: restart sshd + +- name: Setting up the system to use sssd for authentication + command: authconfig --enablemkhomedir --enablesssd --enablesssdauth --update + changed_when: false diff --git a/playbooks/roles/sssd/tasks/main.yml b/playbooks/roles/sssd/tasks/main.yml index 6b221d24..acad08c2 100644 --- a/playbooks/roles/sssd/tasks/main.yml +++ b/playbooks/roles/sssd/tasks/main.yml @@ -1,5 +1,11 @@ +- include_vars: /opt/oci-hpc/playbooks/roles/openldap/vars/debian_vars.yml + when: ansible_distribution == 'Ubuntu' + - include: el-7.yml when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' +- include: el-8.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '8' + - include: debian.yml when: ansible_distribution == 'Ubuntu' \ No newline at end of file diff --git a/playbooks/roles/telegraf/tasks/el.yml b/playbooks/roles/telegraf/tasks/common.yml old mode 100755 new mode 100644 similarity index 68% rename from playbooks/roles/telegraf/tasks/el.yml rename to playbooks/roles/telegraf/tasks/common.yml index ebacb8d0..6e531449 --- a/playbooks/roles/telegraf/tasks/el.yml +++ b/playbooks/roles/telegraf/tasks/common.yml @@ -1,27 +1,4 @@ --- -- name: Add influxdb repository - become: true - yum_repository: - name: influxdb - description: InfluxDB Repository - RHEL $releasever - baseurl: https://repos.influxdata.com/rhel/{{ ansible_distribution_major_version }}/$basearch/stable - enabled: 1 - gpgcheck: 1 - gpgkey: https://repos.influxdata.com/influxdb.key - -- name: Install pip - vars: 
- package_name: - - python3-pip - include_role: - name: safe_yum - -- name: install influx pip - become: true - pip: - name: influxdb - executable: pip3 - - name: Create database shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" diff --git a/playbooks/roles/telegraf/tasks/main.yml b/playbooks/roles/telegraf/tasks/main.yml index e3450c91..b1d4a1f1 100755 --- a/playbooks/roles/telegraf/tasks/main.yml +++ b/playbooks/roles/telegraf/tasks/main.yml @@ -1,4 +1,2 @@ -- include: el.yml - when: ansible_os_family == 'RedHat' -- include: ubuntu.yml - when: ansible_os_family == 'Debian' +- include: common.yml + when: ansible_os_family == 'RedHat' or ansible_os_family == 'Debian' diff --git a/playbooks/roles/telegraf/tasks/ubuntu.yml b/playbooks/roles/telegraf/tasks/ubuntu.yml deleted file mode 100644 index 38ee843c..00000000 --- a/playbooks/roles/telegraf/tasks/ubuntu.yml +++ /dev/null @@ -1,80 +0,0 @@ ---- -- name: Add InfluxData's key - become: true - apt_key: - state: present - url: https://repos.influxdata.com/influxdb.key - -- name: Manage InfluxData APT repositories - become: true - apt_repository: - repo: deb https://repos.influxdata.com/{{ ansible_distribution | lower }} {{ ansible_distribution_release }} stable - state: present - -- name: Install pip - vars: - package_name: - - python3-pip - include_role: - name: safe_yum - -- name: install influx pip - become: true - vars: - ansible_python_interpreter: /usr/bin/python3 - pip: - name: influxdb - executable: pip3 - -- name: Create database - shell: "python3 -c \"import influxdb; influxdb.InfluxDBClient(host='{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}', port=8086).create_database('telegraph')\"" - -#- name: Create database -# influxdb_database: -# hostname: "{{ hostvars[groups['bastion'][0]]['ansible_fqdn'] }}" -# database_name: "telegraf" -# run_once: true - -- name: Install telegraf - vars: - package_name: - - telegraf - package_state: latest - include_role: - name: safe_yum - -- name: copy telegraf.conf - become: true - copy: - src: "{{ item }}" - dest: /etc/telegraf/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - telegraf.conf - -- name: render conf files - become: true - template: - src: "{{ item }}.j2" - dest: /etc/telegraf/telegraf.d/{{item}} - force: yes - backup: yes - owner: telegraf - group: telegraf - mode: 0744 - with_items: - - infiniband.conf - - influxdb.conf - - net.conf - - infiniband_hw_counters.conf -- name: restart telegraf - become: true - service: - name: telegraf - state: restarted - enabled: yes - diff --git a/playbooks/roles/tuned/files/tuned.conf b/playbooks/roles/tuned/files/tuned.conf new file mode 100644 index 00000000..c1bff86a --- /dev/null +++ b/playbooks/roles/tuned/files/tuned.conf @@ -0,0 +1,39 @@ +[main] +summary=Perf tuning for common GPU workloads + +[cpu] +force_latency=1 +governor=performance +energy_perf_bias=performance +min_perf_pct=100 + +[vm] +transparent_huge_pages=never + +[sysctl] +net.ipv4.tcp_timestamps=1 +net.ipv4.tcp_sack=1 +net.ipv4.tcp_dsack=1 +net.ipv4.tcp_low_latency=1 +net.ipv4.tcp_adv_win_scale=2 +net.ipv4.tcp_window_scaling=1 +net.ipv4.tcp_slow_start_after_idle=0 +net.ipv4.tcp_syn_retries=8 +net.ipv4.tcp_rmem=4096 87380 16777216 +net.ipv4.tcp_wmem=4096 65536 16777216 +net.core.rmem_max=16777216 +net.core.wmem_max=16777216 +net.core.rmem_default=16777216 +net.core.wmem_default=16777216 
+net.core.optmem_max=16777216 +net.core.somaxconn = 8192 +net.core.netdev_max_backlog=250000 +sunrpc.udp_slot_table_entries=128 +sunrpc.tcp_slot_table_entries=128 +kernel.sysrq = 1 +kernel.sched_min_granularity_ns = 10000000 +kernel.sched_wakeup_granularity_ns = 15000000 +vm.min_free_kbytes = 16777216 +vm.dirty_ratio = 30 +vm.dirty_background_ratio = 10 +vm.swappiness=30 diff --git a/playbooks/roles/tuned/tasks/el-7.yml b/playbooks/roles/tuned/tasks/el-7.yml new file mode 100644 index 00000000..2dc0f0a6 --- /dev/null +++ b/playbooks/roles/tuned/tasks/el-7.yml @@ -0,0 +1,17 @@ +--- + +- name: Ensure tuned profile directory exists + become: true + file: + path='/usr/lib/tuned/oci-network-performance' + state=directory + +- name: Copy profile file + become: true + copy: + src: tuned.conf + dest: "/usr/lib/tuned/oci-network-performance/tuned.conf" + +- name: Start profile + become: true + shell: tuned-adm profile oci-network-performance diff --git a/playbooks/roles/tuned/tasks/main.yml b/playbooks/roles/tuned/tasks/main.yml new file mode 100644 index 00000000..637e8bae --- /dev/null +++ b/playbooks/roles/tuned/tasks/main.yml @@ -0,0 +1,2 @@ + - include: el-7.yml + when: ansible_os_family == 'RedHat' and ansible_distribution_major_version == '7' and (shape == 'BM.GPU.B4.8' or shape == 'BM.GPU4.8' or shape == 'BM.GPU.A100-v2.8') diff --git a/playbooks/site.yml b/playbooks/site.yml old mode 100755 new mode 100644 index 746adf8f..abee7284 --- a/playbooks/site.yml +++ b/playbooks/site.yml @@ -10,7 +10,7 @@ - hostname # for ubuntu, on all compute nodes, run --fix-broken install -- hosts: compute +- hosts: compute, login become: true tasks: - include_role: @@ -81,7 +81,7 @@ name: fss-home when: add_nfs|bool and home_fss|bool -- hosts: bastion, slurm_backup +- hosts: bastion, slurm_backup, login become: true tasks: - include_role: @@ -110,12 +110,22 @@ name: cluster-cli when: ldap|default(true)|bool -- hosts: compute +# configure if instance_principal is False +- hosts: bastion + become: true + tasks: + - include_role: + name: no_instance_principal + when: not inst_prin|bool + + + +- hosts: compute, login become: true tasks: - include_role: name: home_nfs - when: home_nfs|default(true)|bool + when: home_nfs|default(true)|bool or home_fss|bool - include_role: name: nfs-client vars: @@ -154,7 +164,7 @@ - include_role: name: mysql -- hosts: slurm_backup +- hosts: slurm_backup, login become: true vars: iscsi_ip: "{{ bastion_mount_ip }}" @@ -215,10 +225,6 @@ - hosts: all become: true - vars_files: - - "/opt/oci-hpc/playbooks/roles/openldap/defaults/main.yml" - - "/opt/oci-hpc/playbooks/roles/openldap/vars/el_vars.yml" - - "/opt/oci-hpc/playbooks/roles/openldap/vars/debian_vars.yml" tasks: - include_role: name: sssd @@ -239,11 +245,17 @@ - include_role: name: yaml -- hosts: bastion +- hosts: all tasks: - include_role: name: influxdb when: monitoring|default(false)|bool + - include_role: + name: telegraf + when: monitoring|default(false)|bool + +- hosts: bastion + tasks: - include_role: name: grafana when: monitoring|default(false)|bool @@ -268,12 +280,14 @@ - include_role: name: nvidia-enroot when: enroot|default(true)|bool + - include_role: + name: tuned - hosts: all vars: destroy: false initial: true - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" resize: false enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: @@ -285,11 +299,6 @@ - include_role: name: spack 
when: spack|default(false)|bool - - include_role: - name: telegraf - when: monitoring|default(false)|bool - include_role: name: slurm when: slurm|default(false)|bool - - diff --git a/playbooks/slurm_config.yml b/playbooks/slurm_config.yml index bc2da813..bb3f6995 100755 --- a/playbooks/slurm_config.yml +++ b/playbooks/slurm_config.yml @@ -1,9 +1,9 @@ -- hosts: bastion,slurm_backup,compute +- hosts: bastion,slurm_backup,compute,login gather_facts: true vars: destroy: false initial: true - download_path: "{{ cluster_nfs_path if cluster_nfs|bool else '/tmp' }}" + download_path: "{{ nfs_target_path if create_fss | bool else ( cluster_nfs_path if cluster_nfs|bool else '/tmp') }}" enroot_top_path: "{{ nvme_path }}/enroot/" vars_files: - "/opt/oci-hpc/conf/queues.conf" diff --git a/playbooks/slurm_config_as.yml b/playbooks/slurm_config_as.yml index 4ccebf99..f92067be 100755 --- a/playbooks/slurm_config_as.yml +++ b/playbooks/slurm_config_as.yml @@ -3,7 +3,7 @@ tasks: - debug: msg: "Gathering facts" -- hosts: compute +- hosts: compute, slurm_backup gather_facts: true vars: destroy: false diff --git a/samples/NCCL_readme b/samples/NCCL_readme new file mode 100644 index 00000000..9279fd69 --- /dev/null +++ b/samples/NCCL_readme @@ -0,0 +1,11 @@ +To Run a NCCL test, run the following commands: +chmod 775 /opt/oci-hpc/samples/prep_sample_files.sh +/opt/oci-hpc/samples/prep_sample_files.sh + +SSH to one of the compute nodes and run: ~/compile.sh + +From the bastion, you can edit the third line of /home/opc/nccl_run_allreduce.sbatch with the number of nodes that you would like to test on: +sbatch /home/opc/nccl_run_allreduce.sbatch + +Look at the last line of the log for bandwidth. + diff --git a/samples/gpu/nccl_run_allreduce.sbatch b/samples/gpu/nccl_run_allreduce.sbatch index 505d60a3..bbcfa484 100644 --- a/samples/gpu/nccl_run_allreduce.sbatch +++ b/samples/gpu/nccl_run_allreduce.sbatch @@ -19,20 +19,31 @@ scontrol show hostnames $SLURM_JOB_NODELIST > $MACHINEFILE echo MACHINEFILE cat $MACHINEFILE -python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + python3 /home/opc/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null + USER=opc +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + python3 /home/ubuntu/node_ordering_by_rack.py --input_file $MACHINEFILE > /dev/null + USER=ubuntu +fi + echo ORDEREDMACHINEFILE cat $ORDEREDMACHINEFILE echo ORDEREDRANKMACHINEFILE cat $ORDEREDRANKMACHINEFILE +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` -if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh -else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` fi +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path export NCCL_DEBUG=WARN @@ -51,6 +62,7 @@ then fi mpirun --mca pml ucx \ --bind-to numa \ + --mca coll ^hcoll \ -x NCCL_DEBUG=WARN \ -x NCCL_IB_SL=0 \ -x NCCL_IB_TC=41 \ @@ -62,6 +74,6 @@ fi -x NCCL_IB_GID_INDEX=3 \ -x NCCL_ALGO=Ring \ -x NCCL_IB_HCA="${var_NCCL_IB_HCA}" \ - --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/opc/nccl-tests/build/all_reduce_perf -b1G -e10G -i$((1024*1024*1024*9)) -n 100 + --np $((SLURM_NNODES*SLURM_NTASKS_PER_NODE)) --rankfile $ORDEREDRANKMACHINEFILE /home/$USER/nccl-tests/build/all_reduce_perf -b1G -e10G 
-i$((1024*1024*1024*9)) -n 100 diff --git a/samples/gpu/nccl_run_allreduce.sh b/samples/gpu/nccl_run_allreduce.sh index 2edd75f8..fd2ae7fc 100644 --- a/samples/gpu/nccl_run_allreduce.sh +++ b/samples/gpu/nccl_run_allreduce.sh @@ -42,11 +42,11 @@ do hostfile=$hostfile; np=$np ; iter=20; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi + first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` if [ $shape == \"BM.GPU.B4.8\" ] || [ $shape == \"BM.GPU.A100-v2.8\" ] diff --git a/samples/gpu/nccl_run_alltoall.sh b/samples/gpu/nccl_run_alltoall.sh index 896f4a56..e1be500d 100644 --- a/samples/gpu/nccl_run_alltoall.sh +++ b/samples/gpu/nccl_run_alltoall.sh @@ -49,11 +49,10 @@ do hostfile=$hostfile; np=$np ; iter=50; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch index 92c624a5..203f3ba6 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sbatch +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sbatch @@ -26,11 +26,10 @@ cat $ORDEREDMACHINEFILE echo ORDEREDRANKMACHINEFILE cat $ORDEREDRANKMACHINEFILE -if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh -else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh -fi +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` +source $mpivars_path + +if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi #source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh #source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh diff --git a/samples/gpu/qfabv1_nccl_run_allreduce.sh b/samples/gpu/qfabv1_nccl_run_allreduce.sh index 831b184e..f5c5cad6 100644 --- a/samples/gpu/qfabv1_nccl_run_allreduce.sh +++ b/samples/gpu/qfabv1_nccl_run_allreduce.sh @@ -43,11 +43,10 @@ do hostfile=$hostfile; np=$np ; iter=20; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi first_node=`head $hostfile -n 1` shape=`ssh $first_node 'curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/' | jq .shape` diff --git a/samples/gpu/qfabv1_nccl_run_alltoall.sh b/samples/gpu/qfabv1_nccl_run_alltoall.sh index 565bffe5..a9d217d8 100644 --- a/samples/gpu/qfabv1_nccl_run_alltoall.sh +++ b/samples/gpu/qfabv1_nccl_run_alltoall.sh @@ -51,11 +51,10 @@ do hostfile=$hostfile; np=$np ; iter=50; - if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source 
/usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - fi + mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + source $mpivars_path + + if [[ "$mpivars_path" == "" ]]; then echo "Could not find MPIPATH"; exit; fi first_node=`head $hostfile -n 1` diff --git a/samples/nccl_compile/compile.sh b/samples/nccl_compile/compile.sh index a6d85d6b..dbf37e8a 100644 --- a/samples/nccl_compile/compile.sh +++ b/samples/nccl_compile/compile.sh @@ -1,17 +1,27 @@ -#!/bin/bash +#!/bin/bash # Run on 1 GPU node only -if [ -f /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh ]; then - source /usr/mpi/gcc/openmpi-4.1.0rc5/bin/mpivars.sh - MPI_HOME=/usr/mpi/gcc/openmpi-4.1.0rc5 -else - source /usr/mpi/gcc/openmpi-4.0.3rc4/bin/mpivars.sh - MPI_HOME=/usr/mpi/gcc/openmpi-4.0.3rc4 +mpivars_path=`ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh` + +if [[ "$mpivars_path" == "" ]]; then + mpivars_path=`ls /opt/openmpi-*/bin/mpivars.sh` +fi + +if [[ "$mpivars_path" == "" ]]; then + echo "Could not find MPIPATH"; exit; fi + +source $mpivars_path +MPI_HOME=${mpivars_path%%/bin*} + +source /etc/os-release +if [ $ID == "ol" ] || [ $ID == "centos" ] ; then + cd /home/opc +elif [ $ID == "debian" ] || [ $ID == "ubuntu" ] ; then + cd /home/ubuntu fi -cd /home/opc git clone https://github.com/NVIDIA/nccl-tests.git cd nccl-tests/ make MPI=1 MPI_HOME=$MPI_HOME CUDA_HOME=/usr/local/cuda diff --git a/samples/prep_sample_files.sh b/samples/prep_sample_files.sh index fa10ee1a..0b9d78fb 100644 --- a/samples/prep_sample_files.sh +++ b/samples/prep_sample_files.sh @@ -9,5 +9,5 @@ done; cp nccl_compile/compile.sh /home/opc/ cp gpu/*.sbatch /home/opc/ - +cp /opt/oci-hpc/bin/node_ordering_by_rack.py /home/opc/ diff --git a/schema.yaml b/schema.yaml index 7ad5400b..1b22dc25 100755 --- a/schema.yaml +++ b/schema.yaml @@ -30,6 +30,7 @@ variableGroups: - ${bastion_ad} - ${bastion_shape} - ${bastion_ocpus} + - ${bastion_ocpus_denseIO_flex} - ${bastion_custom_memory} - ${bastion_memory} - ${bastion_boot_volume_size} @@ -43,6 +44,7 @@ variableGroups: - ${cluster_network_shape} - ${instance_pool_shape} - ${instance_pool_ocpus} + - ${instance_pool_ocpus_denseIO_flex} - ${instance_pool_custom_memory} - ${instance_pool_memory} - ${node_count} @@ -57,12 +59,35 @@ variableGroups: - ${compute_image_compartment} - ${image} - ${image_ocid} + - title: "Additionnal Login Node" + variables: + - ${login_node} + - ${login_ad} + - ${login_shape} + - ${login_ocpus} + - ${login_ocpus_denseIO_flex} + - ${login_custom_memory} + - ${login_memory} + - ${login_boot_volume_size} + - ${use_standard_image_login} + - ${use_marketplace_image_login} + - ${use_old_marketplace_image_login} + - ${marketplace_listing_login} + - ${old_marketplace_listing_login} + - ${unsupported_login} + - ${login_image_compartment} + - ${custom_login_image} + - ${unsupported_login_image} + - ${login_username} + - ${login_block} + - ${login_block_volume_size} + - ${login_block_volume_performance} - title: Autoscaling variables: - ${autoscaling} - ${autoscaling_monitoring} - ${latency_check} - - title: "API authentication" + - title: "API authentication, needed for autoscaling" variables: - ${inst_prin} - ${api_user_ocid} @@ -134,11 +159,13 @@ variableGroups: - ${slurm_nfs} - ${slurm_ha} - ${rack_aware} - - ${pyxis} - ${queue} - ${spack} - ${monitoring} - ${enroot} + - ${pyxis} + - ${pam} + - ${sacct_limits} - title: "Hidden" variables: @@ -234,6 +261,7 @@ variables: required: true default: VM.Standard2.4 bastion_ocpus: + title: "Cores" type: integer 
description: Number of OCPU's for flex shape minimum: 1 @@ -258,6 +286,24 @@ variables: - ${bastion_shape} - "VM.Standard3.Flex" required: true + + bastion_ocpus_denseIO_flex: + title: "Cores" + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 8 + visible: + and: + - or: + - eq: + - ${bastion_shape} + - "VM.DenseIO.E4.Flex" + required: true + bastion_custom_memory: title: Use custom memory size type: boolean @@ -339,7 +385,7 @@ variables: unsupported_bastion_image: title: "Image OCID" - description: "Custom image ID for compute nodes" + description: "Custom image ID for compute nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as bastion image at this moment." type: string required: true visible: @@ -363,7 +409,7 @@ variables: custom_bastion_image: title: "Bastion Image ID" - description: "Custom image ID for bastion nodes. Please note that only Oracle Linux and Ubuntu 20.04 are supported as bastion image at this moment. " + description: "Custom image ID for bastion nodes. Please note that only Oracle Linux 7 and Ubuntu 20.04 are supported as bastion image at this moment. " type: oci:core:image:id dependsOn: compartmentId: ${bastion_image_compartment} @@ -462,6 +508,7 @@ variables: - "BM.GPU.B4.8" - "BM.GPU.A100-v2.8" - "BM.Optimized3.36" + - "BM.HPC.E5.128" default: "BM.HPC2.36" title: "Shape of the Compute Nodes" description: "Shape of compute nodes used in permanent/initial cluster" @@ -509,6 +556,23 @@ variables: - "VM.Standard3.Flex" required: true + instance_pool_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 8 + visible: + and: + - or: + - eq: + - ${instance_pool_shape} + - "VM.DenseIO.E4.Flex" + required: true + instance_pool_custom_memory: title: Use custom memory size type: boolean @@ -643,7 +707,7 @@ variables: image: title: "Image" - description: "Custom image ID for compute nodes" + description: "Custom image ID for compute nodes. Supported OS are OL7, OL8, CentOS7 and Ubuntu 20.04" type: oci:core:image:id required: true dependsOn: @@ -677,7 +741,7 @@ variables: image_ocid: title: "Image OCID" - description: "Custom image ID for compute nodes" + description: "Custom image ID for compute nodes. Supported OS are OL7, OL8, CentOS7 and Ubuntu 20.04" type: string required: true visible: @@ -691,6 +755,8 @@ variables: type: boolean title: "Show advanced storage options" default: false + description: "Including running home on FSS." + visible: true use_scratch_nfs: type: boolean @@ -875,8 +941,8 @@ variables: rdma_subnet: type: string title: "RDMA subnet IP range" - default: "192.168.168.0/22" - description: "Must be the same size as private subnet" + default: "192.168.0.0/16" + description: "Must be at least the same size as private subnet for HPC and at least 16 times the size of the private subnet for GPUs" required: true private_subnet: type: string @@ -918,14 +984,17 @@ variables: default: false required: true description: "Add a second master of the same shape as the bastion as a back-up controller node. We recommend using a FSS to save the state and share between masters" - visible: - - ${slurm} + visible: ${slurm} pyxis: type: boolean title: "Install Nvidia Pyxis plugin for Slurm" default: false - description: "Install Pyxis. Pyxis is a plugin that integrates Enroot with Slurm." + description: "Install Pyxis. Pyxis is a plugin that integrates Enroot with Slurm. 
(Warning: using Pyxis with autoscaling causes an issue that prevents jobs from being scheduled on nodes that have yet to be spun up)" + visible: + and: + - ${slurm} + - ${enroot} rack_aware: type: boolean @@ -933,8 +1002,7 @@ variables: default: false required: true description: "Slurm topology can define rack aware topologies to prioritize nodes on same racks per job.\n This is an LA feature and your tenancy needs to be whitelisted" - visible: - - ${slurm} + visible: ${slurm} queue: @@ -943,8 +1011,7 @@ variables: type: string default: "compute" required: true description: "Add the permanent cluster to a specific queue, compute is the default queue" - visible: - - ${slurm} + visible: ${slurm} spack: type: boolean @@ -957,6 +1024,21 @@ variables: title: "Install Nvidia Enroot for containerized GPU workloads" default: false description: "Install Enroot, Nvidia Container Toolkit, and docker." + visible: ${slurm} + + pam: + type: boolean + title: "Enable PAM" + default: false + description: "Enable PAM for the Slurm cluster (Supported only on OL with RHCK kernel at this time). When PAM is enabled, users that are not in the sudo group will not be able to SSH into the compute nodes unless they have an active job running in Slurm." + visible: ${slurm} + + sacct_limits: + type: boolean + title: "Enable Limits for Slurm jobs" + default: false + description: "Enable limits for the Slurm cluster. When enabled, users will not be able to submit jobs if the right limits are not set" + visible: ${slurm} monitoring: type: boolean @@ -978,7 +1060,7 @@ variables: inst_prin: type: boolean - title: "Use Instance Principal (required for autoscaling)" + title: "Use Instance Principal instead of configuration file" description: "You will need to set a dynamic group and policy to allow the bastion to authenticate. This will not be created automatically."
default: true @@ -1159,4 +1241,295 @@ variables: required: true visible: and: - - ${privilege_sudo} \ No newline at end of file + - ${privilege_sudo} + + + + login_node: + type: boolean + title: "Login Node" + default: false + description: "Create an additional login node for users" + + login_ad: + type: oci:identity:availabilitydomain:name + dependsOn: + compartmentId: ${targetCompartment} + visible: + and: + - complexExpression + - ${login_node} + required: true + description: "Availability Domain for login node" + title: "Availability Domain For Login Node" + default: ${ad} + + login_shape: + type: oci:core:instanceshape:name + dependsOn: + compartmentId: ${targetCompartment} + required: true + default: VM.Standard2.4 + visible: ${login_node} + + login_ocpus: + type: integer + description: Number of OCPU's for flex shape + minimum: 1 + maximum: 64 + default: 2 + visible: + and: + - or: + - eq: + - ${login_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Optimized3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${login_shape} + - "VM.Standard3.Flex" + - ${login_node} + required: true + + login_ocpus_denseIO_flex: + title: Cores + type: enum + description: Number of OCPU's for Dense IO flex shape + enum: + - 8 + - 16 + - 32 + default: 8 + visible: + and: + - or: + - eq: + - ${login_shape} + - "VM.DenseIO.E4.Flex" + - ${login_node} + required: true + + login_custom_memory: + title: Use custom memory size + type: boolean + default: false + visible: + and: + - or: + - eq: + - ${login_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${login_shape} + - "VM.Optimized3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${login_shape} + - "VM.Standard3.Flex" + - ${login_node} + login_memory: + title: Memory in GBS + type: integer + description: Number of memory for flex shape. Minimum 1GB per core. + minimum: 1 + maximum: 1024 + default: 16 + visible: + and: + - and: + - or: + - eq: + - ${login_shape} + - "VM.Standard.E3.Flex" + - eq: + - ${login_shape} + - "VM.Optimized3.Flex" + - eq: + - ${login_shape} + - "VM.Standard.E4.Flex" + - eq: + - ${login_shape} + - "VM.Standard.A1.Flex" + - eq: + - ${login_shape} + - "VM.Standard3.Flex" + - and: + - ${login_custom_memory} + - ${login_node} + required: true + + login_boot_volume_size: + type: integer + required: true + minimum: 50 + title: "Size of the boot volume in GB" + default: 50 + visible: ${login_node} + + login_block: + type: boolean + title: Additional block volume for login node + default: false + visible: ${login_node} + + login_block_volume_size: + required: true + type: integer + title: "Size of the additional volume in GB" + default: 1000 + visible: + and: + - and: + - ${login_block} + - ${login_node} + login_block_volume_performance: + type: enum + title: "Block volume performance" + required: true + enum: + - "0. Lower performance" + - "10. Balanced performance" + - "20. High Performance" + default: "10. 
Balanced performance" + visible: + and: + - and: + - ${login_block} + - ${login_node} + use_standard_image_login: + type: boolean + title: "use standard login image" + description: > + "Use standard login image (Oracle Linux)" + default: true + visible: ${login_node} + + unsupported_login: + title: "Use unsupported image" + description: "Custom image ID for Login Node" + type: boolean + default: false + visible: + not: + - ${use_standard_image_login} + + login_image_compartment: + title: "login image compartment" + type: oci:identity:compartment:id + default: ${targetCompartment} + visible: + and: + - not: + - ${use_standard_image_login} + - not: + - ${unsupported_login} + - not: + - ${use_marketplace_image_login} + required: true + + custom_login_image: + title: "Login Image ID" + description: "Custom image ID for login nodes. Please note that only Oracle Linux and Ubuntu 20.04 are supported as login image at this moment. " + type: oci:core:image:id + dependsOn: + compartmentId: ${login_image_compartment} + visible: + and: + - not: + - ${use_standard_image_login} + - not: + - ${unsupported_login} + - not: + - ${use_marketplace_image_login} + required: true + unsupported_login_image: + title: "Image OCID" + description: "Custom image ID for login nodes" + type: string + required: true + visible: + and: + - ${unsupported_login} + - not: + - ${use_standard_image_login} + - not: + - ${use_marketplace_image_login} + default: "image.ocid" + + login_username: + title: "Default username for login node" + description: "Custom image ID for login node" + type: string + default: "opc" + required: true + visible: + not: + - ${use_standard_image_login} + + use_marketplace_image_login: + type: boolean + title: "use marketplace image" + description: "Use marketplace image, otherwise provide custom image OCID" + default: true + visible: + not: + - ${use_standard_image_login} + use_old_marketplace_image_login: + type: boolean + title: "use older marketplace images" + description: "Images prior to September 2021" + default: false + visible: + and: + - ${use_marketplace_image_login} + - not: + - ${use_standard_image_login} + + marketplace_listing_login: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "HPC_OL7" + - "HPC_OL8" + - "GPU" + default: "HPC_OL7" + visible: + and: + - ${use_marketplace_image_login} + - not: + - ${use_old_marketplace_image_login} + - not: + - ${use_standard_image_login} + + old_marketplace_listing_login: + type: enum + title: "Image version" + description: "Marketplace listing to use" + required: true + enum: + - "1. Oracle Linux 7.9 OFED 5.3-1.0.0.1 RHCK 20210607" + - "2. Oracle Linux 7.8 OFED 5.0-1.0.0.0 UEK 20200826" + - "3. Oracle Linux 7.7 OFED 4.4-2.0.7.0 UEK 20200229" + - "4. Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" + default: "4. 
Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" + visible: + and: + - ${use_marketplace_image_login} + - ${use_old_marketplace_image_login} + - not: + - ${use_standard_image_login} \ No newline at end of file diff --git a/slurm_ha.tf b/slurm_ha.tf index 2a63c274..dfa9b507 100644 --- a/slurm_ha.tf +++ b/slurm_ha.tf @@ -67,6 +67,7 @@ resource "null_resource" "backup" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "sudo mkdir -p /opt/oci-hpc", "sudo chown ${var.bastion_username}:${var.bastion_username} /opt/oci-hpc/", "mkdir -p /opt/oci-hpc/bin", @@ -169,6 +170,7 @@ resource "null_resource" "backup" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 600 /home/${var.bastion_username}/.ssh/cluster.key", "cp /home/${var.bastion_username}/.ssh/cluster.key /home/${var.bastion_username}/.ssh/id_rsa", "chmod a+x /opt/oci-hpc/bin/*.sh", @@ -195,6 +197,8 @@ resource "null_resource" "cluster_backup" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, @@ -227,7 +231,7 @@ resource "null_resource" "cluster_backup" { autoscaling = var.autoscaling, cluster_name = local.cluster_name, shape = var.cluster_network ? var.cluster_network_shape : var.instance_pool_shape, - instance_pool_ocpus = var.instance_pool_ocpus, + instance_pool_ocpus = local.instance_pool_ocpus, queue=var.queue, monitoring = var.monitoring, hyperthreading = var.hyperthreading, @@ -240,9 +244,16 @@ resource "null_resource" "cluster_backup" { admin_username = var.autoscaling_mysql_service ? var.admin_username : "root", enroot = var.enroot, pyxis = var.pyxis, + pam = var.pam, + sacct_limits = var.sacct_limits, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, - latency_check = var.latency_check + latency_check = var.latency_check, + inst_prin = var.inst_prin, + region = var.region, + tenancy_ocid = var.tenancy_ocid, + api_fingerprint = var.api_fingerprint, + api_user_ocid = var.api_user_ocid }) destination = "/opt/oci-hpc/playbooks/inventory" @@ -297,7 +308,7 @@ resource "null_resource" "cluster_backup" { private_subnet = var.private_subnet, private_subnet_id = var.private_subnet_id, targetCompartment = var.targetCompartment, - instance_pool_ocpus = var.instance_pool_ocpus, + instance_pool_ocpus = local.instance_pool_ocpus, instance_pool_memory = var.instance_pool_memory, instance_pool_custom_memory = var.instance_pool_custom_memory, queue=var.queue, @@ -319,14 +330,18 @@ resource "null_resource" "cluster_backup" { bastion_ip = oci_core_instance.bastion.private_ip, backup_name = var.slurm_ha ? oci_core_instance.backup[0].display_name : "", backup_ip = var.slurm_ha ? oci_core_instance.backup[0].private_ip: "", + login_name = var.login_node ? oci_core_instance.login[0].display_name : "", + login_ip = var.login_node ? oci_core_instance.login[0].private_ip: "", compute = var.node_count > 0 ? 
zipmap(local.cluster_instances_names, local.cluster_instances_ips) : zipmap([],[]) public_subnet = data.oci_core_subnet.public_subnet.cidr_block, public_subnet_id = local.bastion_subnet_id, private_subnet = data.oci_core_subnet.private_subnet.cidr_block, private_subnet_id = local.subnet_id, - nfs = var.node_count > 0 ? local.cluster_instances_names[0] : "", + rdma_subnet = var.rdma_subnet, + nfs = var.node_count > 0 && var.use_scratch_nfs ? local.cluster_instances_names[0] : "", scratch_nfs = var.use_scratch_nfs && var.node_count > 0, scratch_nfs_path = var.scratch_nfs_path, + use_scratch_nfs = var.use_scratch_nfs, slurm = var.slurm, slurm_nfs_path = var.add_nfs ? var.nfs_source_path : var.cluster_nfs_path rack_aware = var.rack_aware, @@ -364,13 +379,15 @@ resource "null_resource" "cluster_backup" { autoscaling_monitoring = var.autoscaling_monitoring, enroot = var.enroot, pyxis = var.pyxis, + pam = var.pam, + sacct_limits = var.sacct_limits, privilege_sudo = var.privilege_sudo, privilege_group_name = var.privilege_group_name, latency_check = var.latency_check, private_deployment = var.private_deployment, bastion_username = var.bastion_username, compute_username = var.compute_username, - use_multiple_ads = var.use_multiple_ads + use_multiple_ads = var.use_multiple_ads }) destination = "/opt/oci-hpc/conf/variables.tf" @@ -396,6 +413,7 @@ resource "null_resource" "cluster_backup" { provisioner "remote-exec" { inline = [ + "#!/bin/bash", "chmod 755 /opt/oci-hpc/autoscaling/crontab/*.sh", "chmod 755 /opt/oci-hpc/autoscaling/credentials/key.sh", "/opt/oci-hpc/autoscaling/credentials/key.sh /opt/oci-hpc/autoscaling/credentials/key.initial /opt/oci-hpc/autoscaling/credentials/key.pem > /opt/oci-hpc/autoscaling/credentials/key.log", diff --git a/variables.tf b/variables.tf index 27606904..8c3862ab 100755 --- a/variables.tf +++ b/variables.tf @@ -12,10 +12,15 @@ variable "cluster_name" { default = "" } variable "bastion_ad" {} variable "bastion_shape" { default = "VM.Standard2.4" } variable "use_standard_image" { default= true } +variable "use_standard_image_login" { default= true } variable "custom_bastion_image" { type = string default = "image.ocid" } +variable "custom_login_image" { + type = string + default = "image.ocid" +} variable "bastion_boot_volume_size" {} variable "cluster_network_shape" { default = "BM.HPC2.36" } variable "instance_pool_shape" { default = "VM.Standard2.4" } @@ -26,6 +31,7 @@ variable "use_old_marketplace_image" { default = false} variable "image" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "image_ocid" { default = "ocid1.image.oc1..aaaaaaaa5yxem7wzie34hi5km4qm2t754tsfxrjuefyjivebrxjad4jcj5oa" } variable "unsupported_bastion_image" { default = "" } +variable "unsupported_login_image" { default = "" } variable "use_cluster_nfs" { default = true} variable "use_scratch_nfs" { default = true } variable "cluster_nfs_path" { default = "/nfs/cluster" } @@ -38,21 +44,31 @@ variable "private_subnet_id" { default = ""} variable "vcn_subnet" { default = "172.16.0.0/21" } variable "public_subnet" { default = "172.16.0.0/24" } variable "additional_subnet" { default = "172.16.1.0/24" } -variable "rdma_subnet" { default = "192.168.168.0/22" } +variable "rdma_subnet" { default = "192.168.0.0/16" } variable "private_subnet" { default = "172.16.4.0/22" } variable "ssh_cidr" { default = "0.0.0.0/0" } variable "slurm" { default = false } variable "slurm_ha" { default = false } +variable "login_node" { default = false } +variable 
"login_ad" {default = ""} +variable "login_shape" { default = "VM.Standard2.4" } +variable "login_boot_volume_size" {default = 50} variable "slurm_nfs" { default = false } variable "rack_aware" { default = false } variable "ldap" { default = true } variable "spack" { default = false } variable "bastion_ocpus" { default = 2} +variable "bastion_ocpus_denseIO_flex" { default = 8} variable "instance_pool_ocpus" { default = 2} +variable "instance_pool_ocpus_denseIO_flex" { default = 8} variable "instance_pool_memory" { default = 16 } variable "instance_pool_custom_memory" { default = false } +variable "login_ocpus" { default = 2} +variable "login_ocpus_denseIO_flex" { default = 8} variable "bastion_memory" { default = 16 } variable "bastion_custom_memory" { default = false } +variable "login_memory" { default = 16 } +variable "login_custom_memory" { default = false } variable "privilege_sudo" { default = true } variable "privilege_group_name" { default = "privilege" } @@ -70,9 +86,12 @@ variable "marketplace_version_id" { "2" = "OL7.8-OFED5.0-1.0.0.0-UEK-20200826" "3" = "OL7.7-OFED-4.4-2.0.7.0-UEK-20200229" "4" = "OL7.9-OFED5.0-2.1.8.0-RHCK-20210709" - "HPC_OL7" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" - "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" - "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "HPC_OL7" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL8" = "OracleLinux-8-RHCK-OFED-5.4-3.6.8.1-2023.01.10-0" + "HPC_OL7_old" = "OL7.9-RHCK-3.10.0-OFED-5.4-3.4.0-1" + "HPC_OL8_old" = "OracleLinux-8-RHCK-OFED-5.4-3.5.8.0-2022.11.15-0" + "GPU_old" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.4.0.0-GPU-510-2022.09.23-1" + "GPU" = "OracleLinux-7-RHCK-3.10.0-OFED-5.4-3.6.8.1-GPU-515-2023.01.10-0" } } @@ -106,6 +125,26 @@ variable "bastion_block" { variable "bastion_block_volume_size" { default = 1000 } + +variable "login_block_volume_performance" { +/* + Allowed values + "0. Lower performance" + "10. Balanced performance" + "20. High Performance" +*/ + +default = "10. Balanced performance" + +} + +variable "login_block" { + default = false +} + +variable "login_block_volume_size" { + default = 1000 +} variable "scratch_nfs_type_cluster" { default = "nvme"} variable "scratch_nfs_type_pool" { default = "none" } variable "cluster_block_volume_size" { default = "1000" } @@ -134,8 +173,8 @@ variable "nfs_options" {default = ""} variable "monitoring" { default = true } variable "enroot" { default = false } variable "pyxis" { default = false } - - +variable "pam" { default = false } +variable "sacct_limits" { default = false } variable "unsupported" { type=bool @@ -147,7 +186,10 @@ variable "unsupported_bastion" { type=bool default = false } - +variable "unsupported_login" { + type=bool + default = false +} variable "bastion_username" { type = string default = "opc" @@ -157,6 +199,10 @@ variable "compute_username" { type = string default = "opc" } +variable "login_username" { + type = string + default = "opc" +} variable "autoscaling_monitoring" { type= bool @@ -190,4 +236,17 @@ variable cluster_nfs_export {default = ""} variable "private_deployment" { default = false } -variable "localdisk" { default = true } \ No newline at end of file +variable "localdisk" { default = true } + + +variable "use_marketplace_image_login" { default = true} +variable "use_old_marketplace_image_login" { default = false} + +variable "marketplace_listing_login" { + default = "HPC_OL7" +} + +variable "old_marketplace_listing_login" { + default = "4. 
Oracle Linux 7.9 OFED 5.0-2.1.8.0 RHCK 20210709" +} + \ No newline at end of file diff --git a/versions.tf b/versions.tf index 44a6c867..458fd9db 100755 --- a/versions.tf +++ b/versions.tf @@ -3,7 +3,7 @@ terraform { required_providers { oci = { source = "oracle/oci" - version = "4.99.0" + version = "4.112.0" } } } \ No newline at end of file
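For reference, the NCCL allreduce test added in samples/NCCL_readme can be driven end to end from the bastion roughly as follows. This is a minimal sketch: the node count of 2, the sed edit of line 3 (assumed to be the #SBATCH --nodes directive), and the slurm-*.out log name are illustrative assumptions, not part of the patch.

```
# Prepare the sample files (copies compile.sh, the sbatch files and
# node_ordering_by_rack.py into the opc home directory).
chmod 775 /opt/oci-hpc/samples/prep_sample_files.sh
/opt/oci-hpc/samples/prep_sample_files.sh

# Build nccl-tests once, on one GPU compute node:
#   ssh <compute-node> '~/compile.sh'

# Set the number of nodes on the third line of the sbatch file
# (assumed here to be the #SBATCH --nodes directive), then submit.
sed -i '3s/.*/#SBATCH --nodes=2/' /home/opc/nccl_run_allreduce.sbatch
sbatch /home/opc/nccl_run_allreduce.sbatch

# The bus bandwidth is reported on the last line of the job log
# (log name assumed to follow the default slurm-<jobid>.out pattern).
tail -n 1 slurm-*.out
```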
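The sample scripts now locate mpivars.sh with a glob instead of hard-coding an Open MPI version, falling back to /opt/openmpi-* and aborting if neither location matches. Below is a minimal sketch of that lookup, factored into a helper for clarity; the function name find_mpivars, the 2>/dev/null redirections, and the check-before-source ordering are illustrative and not taken verbatim from the patch.

```
#!/bin/bash
# Locate mpivars.sh under the OFED Open MPI install, falling back to
# /opt/openmpi-*, and fail cleanly if neither location has it.
find_mpivars() {
    local path
    path=$(ls /usr/mpi/gcc/openmpi-*/bin/mpivars.sh 2>/dev/null | head -n 1)
    if [[ -z "$path" ]]; then
        path=$(ls /opt/openmpi-*/bin/mpivars.sh 2>/dev/null | head -n 1)
    fi
    if [[ -z "$path" ]]; then
        echo "Could not find mpivars.sh" >&2
        return 1
    fi
    printf '%s\n' "$path"
}

mpivars_path=$(find_mpivars) || exit 1
source "$mpivars_path"
# MPI_HOME is everything before /bin, e.g. /usr/mpi/gcc/openmpi-<version>
MPI_HOME=${mpivars_path%%/bin*}
```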
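compile.sh and nccl_run_allreduce.sbatch now branch on /etc/os-release to pick the default user and home directory (opc on Oracle Linux and CentOS, ubuntu on Debian and Ubuntu). A compact sketch of the same branch follows; the case form and the error branch for other distributions are illustrative additions.

```
#!/bin/bash
# Pick the default HPC user from the OS family reported by /etc/os-release.
source /etc/os-release
case "$ID" in
    ol|centos)      USER=opc ;;
    debian|ubuntu)  USER=ubuntu ;;
    *)              echo "Unsupported OS: $ID" >&2; exit 1 ;;
esac

# The sample files and nccl-tests build live in that user's home directory.
cd /home/$USER
```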
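The rdma_subnet default moves from 192.168.168.0/22 to 192.168.0.0/16 because, per the new description, it must be at least as large as the private subnet for HPC shapes and at least 16 times as large for GPU shapes. A quick arithmetic check against the default 172.16.4.0/22 private subnet, as a sketch only:

```
# RDMA subnet sizing check (illustrative arithmetic; prefixes are the stack defaults).
private_prefix=22                                # private subnet 172.16.4.0/22
rdma_prefix=16                                   # new rdma_subnet default 192.168.0.0/16
private_size=$(( 1 << (32 - private_prefix) ))   # 1024 addresses
gpu_required=$(( 16 * private_size ))            # 16384 addresses, i.e. at least a /18
rdma_size=$(( 1 << (32 - rdma_prefix) ))         # 65536 addresses
echo "private=$private_size gpu_required=$gpu_required rdma=$rdma_size"
```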
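versions.tf now pins the oracle/oci provider to 4.112.0. After pulling the change, the provider lock can be refreshed and the new constraint confirmed with standard Terraform commands, run from the stack's root module (assumed working directory):

```
# Refresh the dependency lock file and show the resolved provider requirements.
terraform init -upgrade
terraform providers
```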